diff --git a/lib/parhash.c b/lib/parhash.c index 6d3677b..ddf8981 100644 --- a/lib/parhash.c +++ b/lib/parhash.c @@ -1,503 +1,503 @@ -// -// ParHash.c -// Code shared with all the parallel hash implementations -// -// Copyright (c) Microsoft Corporation. Licensed under the MIT license. -// - -#include "precomp.h" - -VOID -SYMCRYPT_CALL -SymCryptParallelHashProcess_serial( - _In_ PCSYMCRYPT_PARALLEL_HASH pParHash, - _Inout_updates_bytes_( nStates * pHash->stateSize ) PVOID pStates, - SIZE_T nStates, - _Inout_updates_( nOperations ) PSYMCRYPT_PARALLEL_HASH_OPERATION pOperations, - SIZE_T nOperations, - _Out_writes_( cbScratch ) PBYTE pbScratch, - SIZE_T cbScratch ) -{ - SIZE_T i; - PSYMCRYPT_PARALLEL_HASH_OPERATION op; - PCSYMCRYPT_HASH pHash; - - pHash = pParHash->pHash; - op = pOperations; - - // - // Wipe the scratch space to detect erroneous callers. - // We do this so that callers that test on a non-parallel platform will work on a platform that does support - // parallel operations. - // - if( cbScratch < pParHash->parScratchFixed + nStates * SYMCRYPT_PARALLEL_HASH_PER_STATE_SCRATCH ) - { - SymCryptFatal( 'ps2s' ); - } - SymCryptWipeKnownSize( pbScratch, pParHash->parScratchFixed + nStates * SYMCRYPT_PARALLEL_HASH_PER_STATE_SCRATCH ); - - for( i=0; iiHash >= nStates ) - { - SymCryptFatal( 'ps2i' ); - return; // prefast doesn't understand that SymCryptFatal doesn't return despite the annotation - } - switch( op->hashOperation ) - { - case SYMCRYPT_HASH_OPERATION_APPEND: - (*pHash->appendFunc)( (PBYTE)pStates + pHash->stateSize * op->iHash, op->pbBuffer, op->cbBuffer ); - break; - - case SYMCRYPT_HASH_OPERATION_RESULT: - if( op->cbBuffer != pHash->resultSize ) - { - SymCryptFatal( 'ps2r' ); - } - (*pHash->resultFunc)( (PBYTE)pStates + pHash->stateSize * op->iHash, op->pbBuffer ); - break; - - default: - SymCryptFatal( 'ps2o'); - } - op++; - } -} - -// -// This function looks at a state and decides what to do. -// If it returns FALSE, then this state is done and no further processing is required. -// If it returns TRUE, the pbData/cbData have to be processed in parallel. -// This function is called again on the same state after the pbData/cbData have been processed. -// -// Internally, it keeps track of the next step to be taken for this state. -// the processingState keeps track of the next action to take. -// - -// -// An enum to keep track of the state of a request block -// -BOOLEAN -SYMCRYPT_CALL -SymCryptParallelHashSetNextWork( PCSYMCRYPT_PARALLEL_HASH pParHash, PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE pScratch ) -{ - PSYMCRYPT_COMMON_HASH_STATE pState; - PCSYMCRYPT_HASH pHash; - PCSYMRYPT_PARALLEL_HASH_OPERATION pOp; - SIZE_T bytesInBuffer; - SIZE_T todo; - BOOLEAN res; - - // Retrieve the state we will operate on. - pState = (PSYMCRYPT_COMMON_HASH_STATE) pScratch->hashState; - pHash = pParHash->pHash; - - // - // This is a state machine where some states have to iterate - // The loop allows them to use 'continue' for that. - // -#pragma warning( suppress: 4127 ) // conditional expression is constant - while( TRUE ) - { - // - // At this point, the processing state, pbData/cbData, and next pointer define what needs to be done. - // STATE_NEXT: cbData == 0 and we have to process the remaining operations. - // STATE_DATA_START: We are working on the next operation; the first BytesAlreadyProcessed have been hashed, - // and the hash state has an empty buffer. 
- // STATE_DATA_END: We are working on the next operation (an append), and pbData/cbData have whatever partial block remains - // after all the whole blocks have been processed. - // STATE_PAD2: We are working on the next operation (a result), and have processed the first half of a 2-block padding. - // STATE_RESULT: We are working on the next operation (a result), and have processed all the padding. - // - // The pState->dataLength is updated whenver we copy bytes from the append into the state's buffer, or when - // we return TRUE and process bulk data. - // - pOp = pScratch->next; - switch( pScratch->processingState ) - { - case STATE_NEXT: - - if( pOp == NULL ) - { - return FALSE; - } - - bytesInBuffer = pState->bytesInBuffer; - - // SYMCRYPT_ASSERT( pOp->cbBuffer < ((SIZE_T)-1)/2 ); // used during testing - - if( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_APPEND ) - { - pState->dataLengthL += pOp->cbBuffer; - if( pState->dataLengthL < pOp->cbBuffer ) { - pState->dataLengthH ++; // This is almost-unreachable code as it requires 2^64 bytes to be hashed. - } - - if( bytesInBuffer > 0 ) - { - SYMCRYPT_ASSERT( pHash->inputBlockSize > bytesInBuffer ); - - todo = SYMCRYPT_MIN( pHash->inputBlockSize - bytesInBuffer, pOp->cbBuffer ); - memcpy( &pState->buffer[bytesInBuffer], pOp->pbBuffer, todo ); - pState->bytesInBuffer += (UINT32) todo; - if( pState->bytesInBuffer == pHash->inputBlockSize ) - { - // - // We filled the buffer; set it for processing. - // Remember the # bytes we did and set the next state to process the rest of the request. - // - pScratch->pbData = &pState->buffer[0]; - pScratch->cbData = pHash->inputBlockSize; - if( todo == pOp->cbBuffer ) - { - // - // We finished the request after the pbData processing - // - pScratch->next = pOp->next; - // pScratch->processingState = STATE_NEXT // already has that value - } else { - pScratch->processingState = STATE_DATA_START; - SYMCRYPT_ASSERT( todo <= 0xff ); - pScratch->bytesAlreadyProcessed = (BYTE) todo; - } - - pState->bytesInBuffer = 0; // it will be after we process the block - return TRUE; - } else { - // - // We finished the operation; skip to the next one. - // - pScratch->next = pOp->next; - // pScratch->processingState = STATE_NEXT // already has that value - continue; - } - } else { - // - // Buffer is empty; process the bulk data - // - pScratch->pbData = pOp->pbBuffer; - pScratch->cbData = pOp->cbBuffer; - pScratch->processingState = STATE_DATA_END; - - // - // Return TRUE if there is real data to process, and just re-run the state - // machine if we should copy the partial block to the buffer. - // - if( pScratch->cbData >= pHash->inputBlockSize ) - { - return TRUE; - } else { - continue; - } - } - } else { - SYMCRYPT_ASSERT( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_RESULT ); - - if( (*pParHash->parResult1Func)( pParHash, pState, pScratch, &res ) ) - { - return res; - } - } - break; - - case STATE_DATA_START: - // - // The next operation is an append, and the first few bytes of that operation have already been copied to - // the buffer and processed. We need to process the rest. - // Note that the # bytes remaining is never zero. 
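    //
    // Worked example of the arithmetic above, for a 64-byte block size:
    // with bytesInBuffer == 10 and an append of cbBuffer == 200 bytes,
    // todo = min( 64 - 10, 200 ) = 54 bytes top up the buffer (one block,
    // hashed via pbData/cbData), 128 of the remaining 146 bytes are whole
    // bulk blocks, and the last 18 are re-buffered by STATE_DATA_END.
    //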
- // - SYMCRYPT_ASSERT( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_APPEND && pOp->cbBuffer >= pScratch->bytesAlreadyProcessed ); - - pScratch->pbData = pOp->pbBuffer + pScratch->bytesAlreadyProcessed; - pScratch->cbData = pOp->cbBuffer - pScratch->bytesAlreadyProcessed; - if( pScratch->cbData >= pHash->inputBlockSize ) - { - pScratch->processingState = STATE_DATA_END; - return TRUE; - } - - // - // We have less than one block left; this is exactly the same state as we have at the end of - // a normal append. Fall through to that code. - // - // FALLTHROUGH! - - case STATE_DATA_END: - // - // We finished processing the whole blocks of the pScratch->pbData, and have to process the rest. - // The current append is already popped off the work list. - // - if( pScratch->cbData > 0 ) - { - SYMCRYPT_ASSERT( pScratch->cbData < pHash->inputBlockSize ); - memcpy( &pState->buffer[0], pScratch->pbData, pScratch->cbData ); - pState->bytesInBuffer = (UINT32) pScratch->cbData; - } - pScratch->next = pOp->next; - pScratch->processingState = STATE_NEXT; - continue; - - case STATE_RESULT2: - if( (*pParHash->parResult2Func)( pParHash, pState, pScratch, &res ) ) - { - return res; - } - continue; - - case STATE_RESULT_DONE: - - (*pParHash->parResultDoneFunc)( pParHash, pState, pOp ); - - pScratch->next = pOp->next; - pScratch->processingState = STATE_NEXT; - continue; - } - } - - return FALSE; -} - - -// -// Comparison function used to sort the work into largest-first order. -// -int SYMCRYPT_CDECL -compareRequestSize( const void * p1, const void * p2 ) -{ - PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE * pp1 = (PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE *) p1; - PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE * pp2 = (PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE *) p2; - - UINT64 c1 = (*pp1)->bytes; - UINT64 c2 = (*pp2)->bytes; - - // - // This is 'reverse' compare function as we want the largest item first. - // - if( c1 < c2 ) - { - return 1; - } else if( c1 > c2 ) - { - return -1; - } else { - return 0; - } -} - -VOID -SYMCRYPT_CALL -SymCryptParallelHashProcess( - _In_ PCSYMCRYPT_PARALLEL_HASH pParHash, - _Inout_updates_bytes_( nStates * pHash->stateSize ) PVOID pStates, - SIZE_T nStates, - _Inout_updates_( nOperations ) PSYMCRYPT_PARALLEL_HASH_OPERATION pOperations, - SIZE_T nOperations, - _Out_writes_( cbScratch ) PBYTE pbScratch, - SIZE_T cbScratch, - UINT32 maxParallel ) -{ - PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE pScratchState; - PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE * pWork; - SIZE_T nWork; - PSYMCRYPT_PARALLEL_HASH_OPERATION pOp; - PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE pSc; - SIZE_T i; - UINT64 singleSize; - BOOLEAN sameSize; - SIZE_T nPar; - PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE * pNextWork; - SIZE_T todo; - SIZE_T nBytes; - PBYTE pbScratchEnd; - PBYTE pbFixedScratch; - SIZE_T cbFixedScratch; - PCSYMCRYPT_HASH pHash; - - if( nOperations == 0 ) - { - return; - } - - pHash = pParHash->pHash; - - // - // The caller passes us a scratch buffer. 
We split that into the following pieces: - // - // - // SYMCRYPT_PARALLEL_HASH_SCRATCH pScratchState[ nStates ] - // PSYMCRYPT_PARALLEL_HASH_SCRATCH pWork[ nStates ] - // - // scratch space for parallel function - // - - pbScratchEnd = pbScratch + cbScratch; - pScratchState = (PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE) SYMCRYPT_ALIGN_UP( pbScratch ); - pWork = (PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE *) (pScratchState + nStates); - pbFixedScratch = (PBYTE)((((UINT_PTR)(pWork + nStates)) + SYMCRYPT_SIMD_ELEMENT_SIZE - 1) & ~(SYMCRYPT_SIMD_ELEMENT_SIZE - 1)); - cbFixedScratch = pParHash->parScratchFixed; - - if( pbFixedScratch + cbFixedScratch > pbScratchEnd ) - { - SymCryptFatal( 'ps2.' ); - } - - // - // Wipe the scratch state; this sets the pointers to NULL, and the byte counts to 0. - // - memset( pScratchState, 0, nStates * sizeof( *pScratchState )); - nWork = 0; - - // - // The general data structure is as follows. - // For each hash state, we keep our administration in the pScratchState[i]. This contains a pointer to the actual - // hash state, a pointer to a linked list of operations to be performed on this state, pointer/length of the - // current data to be processed, and a few more administrative items. - // We also keep the pWork array of pointers to our scratch states, which contains all the states that still need - // work to be done. - // - // We process over the operations in reverse order to make it easy to build a forward single-linked list - // - pOp = &pOperations[ nOperations ]; - while( pOp > pOperations ) - { - pOp--; - - if( pOp->iHash >= nStates ) - { - SymCryptFatal( 'ps2x' ); - } - - pSc = &pScratchState[ pOp->iHash ]; - - if( pSc->hashState == NULL ) - { - // - // We found a new state that is being modified by this set of operations. - // Set the pointer to the hash state, and add it to the work list. - // - SYMCRYPT_ASSERT( nWork < nStates ); - pSc->hashState = (PBYTE) pStates + pHash->stateSize * pOp->iHash; - pWork[nWork] = pSc; - nWork++; - } - - // - // We estimate how much work we have to do on each state, so that we can start on the largest ones - // and be more efficient. - // - if( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_APPEND ) - { - pSc->bytes += pOp->cbBuffer; - } else if( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_RESULT ) - { - // - // The result could be a 1 or 2-block operation; but it is mostly a 1-block one so that is what we budget. - // - pSc->bytes += pHash->inputBlockSize; - } else { - SymCryptFatal( 'ps2y' ); - } - - // - // Add the operation to the list of operations for this state - // - pOp->next = pSc->next; - pSc->next = pOp; - } - - // - // We have built all the structures. - // Run the SetNextWork on each of them, and drop the ones that don't have work. - // Also detect whether they are all the same size so that we can avoid the sorting cost. - // - SYMCRYPT_ASSERT( nWork > 0 ); - singleSize = (*pWork)->bytes; - sameSize = TRUE; - i = 0; - while( i < nWork ) - { - if( !SymCryptParallelHashSetNextWork( pParHash, pWork[i] ) ) - { - pWork[i] = pWork[nWork-1]; - nWork--; - continue; - } - - if( pWork[i]->bytes != singleSize ) - { - sameSize = FALSE; - } - i++; - } - - if( !sameSize ) - { - qsort( pWork, nWork, sizeof( *pWork ), &compareRequestSize ); - } - - nPar = SYMCRYPT_MIN( nWork, maxParallel ); // # parallel states we currently work on - pNextWork = pWork + nPar; // next work pointer. 
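    //
    // Worked example for the loop below, with a 64-byte input block size:
    // if the active lanes hold cbData = 576, 512, 449 and 390, then
    // todo = 390 and nBytes = 390 & ~63 = 384, so every lane advances by
    // six whole blocks in this pass before the lane list is refreshed.
    //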
- - while( nWork > 0 ) - { - todo = pWork[0]->cbData; - for( i=1; icbData ); - } - - nBytes = todo & ~(pHash->inputBlockSize - 1 ); - - (*pParHash->parAppendFunc)( pWork, nPar, nBytes, pbFixedScratch, cbFixedScratch ); - - for( i=0; icbData < pHash->inputBlockSize ) - { - // - // Once we start a request we finish it; this is not optimal. - // It would be better to switch things around a bit, but that is much more complicated. - // Example: suppose we can do 4-parallel and have requests of size - // 9 8 7 6 6 6 - // Our code does - // Process first 4 of # blocks Resulting state - // 9 8 7 6 / 6 6 6 3 2 1 - / 6 6 - // 6 3 2 1 / 6 1 5 2 1 0 / 6 - // 6 more to finish for a total of 13 blocks. - // - // Better would be: - // Process first 4 of # blocks Resulting state - // 9 8 7 6 / 6 6 6 3 2 1 - / 6 6 - // 6 6 3 2 / 1 - 2 4 4 1 0 / 1 - // 4 more to finish for total of 12 blocks. - // - // Or even better: - // Process first 4 of # blocks Resulting state - // 9 8 7 6 / 6 6 5 4 3 2 1 / 6 6 - // 6 6 4 3 / 2 1 3 3 3 1 - / 2 1 - // 3 3 2 1 / 1 - 1 2 2 1 - / 1 - - // 2 more to finish for a total of 11 blocks. - // Note that this last one requires the interruption of a started hash computation. - // - - if( !SymCryptParallelHashSetNextWork( pParHash, pWork[i] )) - { - if( nWork > nPar ) - { - pWork[i] = *pNextWork++; - nWork--; - } else { - // - // Ugly: copy the last item here, and wind back the loop counter - // by one so that we will process the last item again. - // - pWork[i] = pWork[ --nPar ]; - i--; - nWork--; - } - } - } - } - } - SymCryptWipe( pbFixedScratch, cbFixedScratch ); -} +// +// ParHash.c +// Code shared with all the parallel hash implementations +// +// Copyright (c) Microsoft Corporation. Licensed under the MIT license. +// + +#include "precomp.h" + +VOID +SYMCRYPT_CALL +SymCryptParallelHashProcess_serial( + _In_ PCSYMCRYPT_PARALLEL_HASH pParHash, + _Inout_updates_bytes_( nStates * pHash->stateSize ) PVOID pStates, + SIZE_T nStates, + _Inout_updates_( nOperations ) PSYMCRYPT_PARALLEL_HASH_OPERATION pOperations, + SIZE_T nOperations, + _Out_writes_( cbScratch ) PBYTE pbScratch, + SIZE_T cbScratch ) +{ + SIZE_T i; + PSYMCRYPT_PARALLEL_HASH_OPERATION op; + PCSYMCRYPT_HASH pHash; + + pHash = pParHash->pHash; + op = pOperations; + + // + // Wipe the scratch space to detect erroneous callers. + // We do this so that callers that test on a non-parallel platform will work on a platform that does support + // parallel operations. + // + if( cbScratch < pParHash->parScratchFixed + nStates * SYMCRYPT_PARALLEL_HASH_PER_STATE_SCRATCH ) + { + SymCryptFatal( 'ps2s' ); + } + SymCryptWipeKnownSize( pbScratch, pParHash->parScratchFixed + nStates * SYMCRYPT_PARALLEL_HASH_PER_STATE_SCRATCH ); + + for( i=0; iiHash >= nStates ) + { + SymCryptFatal( 'ps2i' ); + return; // prefast doesn't understand that SymCryptFatal doesn't return despite the annotation + } + switch( op->hashOperation ) + { + case SYMCRYPT_HASH_OPERATION_APPEND: + (*pHash->appendFunc)( (PBYTE)pStates + pHash->stateSize * op->iHash, op->pbBuffer, op->cbBuffer ); + break; + + case SYMCRYPT_HASH_OPERATION_RESULT: + if( op->cbBuffer != pHash->resultSize ) + { + SymCryptFatal( 'ps2r' ); + } + (*pHash->resultFunc)( (PBYTE)pStates + pHash->stateSize * op->iHash, op->pbBuffer ); + break; + + default: + SymCryptFatal( 'ps2o'); + } + op++; + } +} + +// +// This function looks at a state and decides what to do. +// If it returns FALSE, then this state is done and no further processing is required. 
+// If it returns TRUE, the pbData/cbData have to be processed in parallel. +// This function is called again on the same state after the pbData/cbData have been processed. +// +// Internally, it keeps track of the next step to be taken for this state. +// the processingState keeps track of the next action to take. +// + +// +// An enum to keep track of the state of a request block +// +BOOLEAN +SYMCRYPT_CALL +SymCryptParallelHashSetNextWork( PCSYMCRYPT_PARALLEL_HASH pParHash, PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE pScratch ) +{ + PSYMCRYPT_COMMON_HASH_STATE pState; + PCSYMCRYPT_HASH pHash; + PCSYMRYPT_PARALLEL_HASH_OPERATION pOp; + SIZE_T bytesInBuffer; + SIZE_T todo; + BOOLEAN res; + + // Retrieve the state we will operate on. + pState = (PSYMCRYPT_COMMON_HASH_STATE) pScratch->hashState; + pHash = pParHash->pHash; + + // + // This is a state machine where some states have to iterate + // The loop allows them to use 'continue' for that. + // +#pragma warning( suppress: 4127 ) // conditional expression is constant + while( TRUE ) + { + // + // At this point, the processing state, pbData/cbData, and next pointer define what needs to be done. + // STATE_NEXT: cbData == 0 and we have to process the remaining operations. + // STATE_DATA_START: We are working on the next operation; the first BytesAlreadyProcessed have been hashed, + // and the hash state has an empty buffer. + // STATE_DATA_END: We are working on the next operation (an append), and pbData/cbData have whatever partial block remains + // after all the whole blocks have been processed. + // STATE_PAD2: We are working on the next operation (a result), and have processed the first half of a 2-block padding. + // STATE_RESULT: We are working on the next operation (a result), and have processed all the padding. + // + // The pState->dataLength is updated whenever we copy bytes from the append into the state's buffer, or when + // we return TRUE and process bulk data. + // + pOp = pScratch->next; + switch( pScratch->processingState ) + { + case STATE_NEXT: + + if( pOp == NULL ) + { + return FALSE; + } + + bytesInBuffer = pState->bytesInBuffer; + + // SYMCRYPT_ASSERT( pOp->cbBuffer < ((SIZE_T)-1)/2 ); // used during testing + + if( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_APPEND ) + { + pState->dataLengthL += pOp->cbBuffer; + if( pState->dataLengthL < pOp->cbBuffer ) { + pState->dataLengthH ++; // This is almost-unreachable code as it requires 2^64 bytes to be hashed. + } + + if( bytesInBuffer > 0 ) + { + SYMCRYPT_ASSERT( pHash->inputBlockSize > bytesInBuffer ); + + todo = SYMCRYPT_MIN( pHash->inputBlockSize - bytesInBuffer, pOp->cbBuffer ); + memcpy( &pState->buffer[bytesInBuffer], pOp->pbBuffer, todo ); + pState->bytesInBuffer += (UINT32) todo; + if( pState->bytesInBuffer == pHash->inputBlockSize ) + { + // + // We filled the buffer; set it for processing. + // Remember the # bytes we did and set the next state to process the rest of the request. + // + pScratch->pbData = &pState->buffer[0]; + pScratch->cbData = pHash->inputBlockSize; + if( todo == pOp->cbBuffer ) + { + // + // We finished the request after the pbData processing + // + pScratch->next = pOp->next; + // pScratch->processingState = STATE_NEXT // already has that value + } else { + pScratch->processingState = STATE_DATA_START; + SYMCRYPT_ASSERT( todo <= 0xff ); + pScratch->bytesAlreadyProcessed = (BYTE) todo; + } + + pState->bytesInBuffer = 0; // it will be after we process the block + return TRUE; + } else { + // + // We finished the operation; skip to the next one. 
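                    //
                    // (This branch is taken when the whole append fit in the
                    // buffer without filling it: todo == cbBuffer, so there is
                    // no block to hash and no reason to return to the caller.)
                    //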
+ // + pScratch->next = pOp->next; + // pScratch->processingState = STATE_NEXT // already has that value + continue; + } + } else { + // + // Buffer is empty; process the bulk data + // + pScratch->pbData = pOp->pbBuffer; + pScratch->cbData = pOp->cbBuffer; + pScratch->processingState = STATE_DATA_END; + + // + // Return TRUE if there is real data to process, and just re-run the state + // machine if we should copy the partial block to the buffer. + // + if( pScratch->cbData >= pHash->inputBlockSize ) + { + return TRUE; + } else { + continue; + } + } + } else { + SYMCRYPT_ASSERT( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_RESULT ); + + if( (*pParHash->parResult1Func)( pParHash, pState, pScratch, &res ) ) + { + return res; + } + } + break; + + case STATE_DATA_START: + // + // The next operation is an append, and the first few bytes of that operation have already been copied to + // the buffer and processed. We need to process the rest. + // Note that the # bytes remaining is never zero. + // + SYMCRYPT_ASSERT( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_APPEND && pOp->cbBuffer >= pScratch->bytesAlreadyProcessed ); + + pScratch->pbData = pOp->pbBuffer + pScratch->bytesAlreadyProcessed; + pScratch->cbData = pOp->cbBuffer - pScratch->bytesAlreadyProcessed; + if( pScratch->cbData >= pHash->inputBlockSize ) + { + pScratch->processingState = STATE_DATA_END; + return TRUE; + } + + // + // We have less than one block left; this is exactly the same state as we have at the end of + // a normal append. Fall through to that code. + // + // FALLTHROUGH! + + case STATE_DATA_END: + // + // We finished processing the whole blocks of the pScratch->pbData, and have to process the rest. + // The current append is already popped off the work list. + // + if( pScratch->cbData > 0 ) + { + SYMCRYPT_ASSERT( pScratch->cbData < pHash->inputBlockSize ); + memcpy( &pState->buffer[0], pScratch->pbData, pScratch->cbData ); + pState->bytesInBuffer = (UINT32) pScratch->cbData; + } + pScratch->next = pOp->next; + pScratch->processingState = STATE_NEXT; + continue; + + case STATE_RESULT2: + if( (*pParHash->parResult2Func)( pParHash, pState, pScratch, &res ) ) + { + return res; + } + continue; + + case STATE_RESULT_DONE: + + (*pParHash->parResultDoneFunc)( pParHash, pState, pOp ); + + pScratch->next = pOp->next; + pScratch->processingState = STATE_NEXT; + continue; + } + } + + return FALSE; +} + + +// +// Comparison function used to sort the work into largest-first order. +// +int SYMCRYPT_CDECL +compareRequestSize( const void * p1, const void * p2 ) +{ + PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE * pp1 = (PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE *) p1; + PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE * pp2 = (PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE *) p2; + + UINT64 c1 = (*pp1)->bytes; + UINT64 c2 = (*pp2)->bytes; + + // + // This is 'reverse' compare function as we want the largest item first. 
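    //
    // E.g. qsort'ing byte counts { 6, 9, 6, 8, 7, 6 } with this comparator
    // yields 9 8 7 6 6 6: reporting 'greater' for the smaller element
    // inverts qsort's usual ascending order.
    //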
+ // + if( c1 < c2 ) + { + return 1; + } else if( c1 > c2 ) + { + return -1; + } else { + return 0; + } +} + +VOID +SYMCRYPT_CALL +SymCryptParallelHashProcess( + _In_ PCSYMCRYPT_PARALLEL_HASH pParHash, + _Inout_updates_bytes_( nStates * pHash->stateSize ) PVOID pStates, + SIZE_T nStates, + _Inout_updates_( nOperations ) PSYMCRYPT_PARALLEL_HASH_OPERATION pOperations, + SIZE_T nOperations, + _Out_writes_( cbScratch ) PBYTE pbScratch, + SIZE_T cbScratch, + UINT32 maxParallel ) +{ + PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE pScratchState; + PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE * pWork; + SIZE_T nWork; + PSYMCRYPT_PARALLEL_HASH_OPERATION pOp; + PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE pSc; + SIZE_T i; + UINT64 singleSize; + BOOLEAN sameSize; + SIZE_T nPar; + PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE * pNextWork; + SIZE_T todo; + SIZE_T nBytes; + PBYTE pbScratchEnd; + PBYTE pbFixedScratch; + SIZE_T cbFixedScratch; + PCSYMCRYPT_HASH pHash; + + if( nOperations == 0 ) + { + return; + } + + pHash = pParHash->pHash; + + // + // The caller passes us a scratch buffer. We split that into the following pieces: + // + // + // SYMCRYPT_PARALLEL_HASH_SCRATCH pScratchState[ nStates ] + // PSYMCRYPT_PARALLEL_HASH_SCRATCH pWork[ nStates ] + // + // scratch space for parallel function + // + + pbScratchEnd = pbScratch + cbScratch; + pScratchState = (PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE) SYMCRYPT_ALIGN_UP( pbScratch ); + pWork = (PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE *) (pScratchState + nStates); + pbFixedScratch = (PBYTE)((((UINT_PTR)(pWork + nStates)) + SYMCRYPT_SIMD_ELEMENT_SIZE - 1) & ~(SYMCRYPT_SIMD_ELEMENT_SIZE - 1)); + cbFixedScratch = pParHash->parScratchFixed; + + if( pbFixedScratch + cbFixedScratch > pbScratchEnd ) + { + SymCryptFatal( 'ps2.' ); + } + + // + // Wipe the scratch state; this sets the pointers to NULL, and the byte counts to 0. + // + memset( pScratchState, 0, nStates * sizeof( *pScratchState )); + nWork = 0; + + // + // The general data structure is as follows. + // For each hash state, we keep our administration in the pScratchState[i]. This contains a pointer to the actual + // hash state, a pointer to a linked list of operations to be performed on this state, pointer/length of the + // current data to be processed, and a few more administrative items. + // We also keep the pWork array of pointers to our scratch states, which contains all the states that still need + // work to be done. + // + // We process over the operations in reverse order to make it easy to build a forward single-linked list + // + pOp = &pOperations[ nOperations ]; + while( pOp > pOperations ) + { + pOp--; + + if( pOp->iHash >= nStates ) + { + SymCryptFatal( 'ps2x' ); + } + + pSc = &pScratchState[ pOp->iHash ]; + + if( pSc->hashState == NULL ) + { + // + // We found a new state that is being modified by this set of operations. + // Set the pointer to the hash state, and add it to the work list. + // + SYMCRYPT_ASSERT( nWork < nStates ); + pSc->hashState = (PBYTE) pStates + pHash->stateSize * pOp->iHash; + pWork[nWork] = pSc; + nWork++; + } + + // + // We estimate how much work we have to do on each state, so that we can start on the largest ones + // and be more efficient. + // + if( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_APPEND ) + { + pSc->bytes += pOp->cbBuffer; + } else if( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_RESULT ) + { + // + // The result could be a 1 or 2-block operation; but it is mostly a 1-block one so that is what we budget. 
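            //
            // (2 blocks are needed exactly when bytesInBuffer + 1 + 8 exceeds
            // the input block size, i.e. when the 0x80 byte plus the 8-byte
            // length no longer fit; for SHA-256 that is bytesInBuffer > 55.)
            //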
+ // + pSc->bytes += pHash->inputBlockSize; + } else { + SymCryptFatal( 'ps2y' ); + } + + // + // Add the operation to the list of operations for this state + // + pOp->next = pSc->next; + pSc->next = pOp; + } + + // + // We have built all the structures. + // Run the SetNextWork on each of them, and drop the ones that don't have work. + // Also detect whether they are all the same size so that we can avoid the sorting cost. + // + SYMCRYPT_ASSERT( nWork > 0 ); + singleSize = (*pWork)->bytes; + sameSize = TRUE; + i = 0; + while( i < nWork ) + { + if( !SymCryptParallelHashSetNextWork( pParHash, pWork[i] ) ) + { + pWork[i] = pWork[nWork-1]; + nWork--; + continue; + } + + if( pWork[i]->bytes != singleSize ) + { + sameSize = FALSE; + } + i++; + } + + if( !sameSize ) + { + qsort( pWork, nWork, sizeof( *pWork ), &compareRequestSize ); + } + + nPar = SYMCRYPT_MIN( nWork, maxParallel ); // # parallel states we currently work on + pNextWork = pWork + nPar; // next work pointer. + + while( nWork > 0 ) + { + todo = pWork[0]->cbData; + for( i=1; icbData ); + } + + nBytes = todo & ~(pHash->inputBlockSize - 1 ); + + (*pParHash->parAppendFunc)( pWork, nPar, nBytes, pbFixedScratch, cbFixedScratch ); + + for( i=0; icbData < pHash->inputBlockSize ) + { + // + // Once we start a request we finish it; this is not optimal. + // It would be better to switch things around a bit, but that is much more complicated. + // Example: suppose we can do 4-parallel and have requests of size + // 9 8 7 6 6 6 + // Our code does + // Process first 4 of # blocks Resulting state + // 9 8 7 6 / 6 6 6 3 2 1 - / 6 6 + // 6 3 2 1 / 6 1 5 2 1 0 / 6 + // 6 more to finish for a total of 13 blocks. + // + // Better would be: + // Process first 4 of # blocks Resulting state + // 9 8 7 6 / 6 6 6 3 2 1 - / 6 6 + // 6 6 3 2 / 1 - 2 4 4 1 0 / 1 + // 4 more to finish for total of 12 blocks. + // + // Or even better: + // Process first 4 of # blocks Resulting state + // 9 8 7 6 / 6 6 5 4 3 2 1 / 6 6 + // 6 6 4 3 / 2 1 3 3 3 1 - / 2 1 + // 3 3 2 1 / 1 - 1 2 2 1 - / 1 - + // 2 more to finish for a total of 11 blocks. + // Note that this last one requires the interruption of a started hash computation. + // + + if( !SymCryptParallelHashSetNextWork( pParHash, pWork[i] )) + { + if( nWork > nPar ) + { + pWork[i] = *pNextWork++; + nWork--; + } else { + // + // Ugly: copy the last item here, and wind back the loop counter + // by one so that we will process the last item again. + // + pWork[i] = pWork[ --nPar ]; + i--; + nWork--; + } + } + } + } + } + SymCryptWipe( pbFixedScratch, cbFixedScratch ); +} diff --git a/lib/sha256Par.c b/lib/sha256Par.c index a586b9a..a37e493 100644 --- a/lib/sha256Par.c +++ b/lib/sha256Par.c @@ -1,1474 +1,1474 @@ -// -// Sha256Par.c -// -// Copyright (c) Microsoft Corporation. Licensed under the MIT license. -// - -// -// This module contains the routines to implement SHA2-256 from FIPS 180-2 in parallel mode -// - -#include "precomp.h" - -extern SYMCRYPT_ALIGN_AT( 256 ) const UINT32 SymCryptSha256K[64]; - -#define PAR_SCRATCH_ELEMENTS (4+8+64) // # scratch elements our parallel impementations need - - - -// -// Not all CPU architectures support parallel code. 
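//
// The PAR_SCRATCH_ELEMENTS count above breaks down as the block functions
// below use it. One element is a full SIMD vector of UINT32 lanes (__m128i,
// __m256i, or __n128):
//
//      buf[ 0.. 3]   copies of the d,c,b,a chaining words for the feedback add
//      buf[ 4..11]   transposed chaining state, in order h,g,f,e,d,c,b,a ("ha")
//      buf[12..75]   the 64 expanded message words ("W")
//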
-// -#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 - -#define SUPPORT_PARALLEL 1 -#define MIN_PARALLEL 2 -#define MAX_PARALLEL 8 - -#elif SYMCRYPT_CPU_ARM - -#define SUPPORT_PARALLEL 1 -#define MIN_PARALLEL 3 -#define MAX_PARALLEL 4 - -#else - -#define SUPPORT_PARALLEL 0 - -#endif - - -VOID -SYMCRYPT_CALL -SymCryptParallelSha256AppendBytes_serial( - _Inout_updates_( nPar ) PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE * pWork, - _In_range_(1, MAX_PARALLEL) SIZE_T nPar, - SIZE_T nBytes ); - -// -// Currently these are the generic implementations in terms of the single hash code. -// - -VOID -SYMCRYPT_CALL -SymCryptParallelSha256Init( - _Out_writes_( nStates ) PSYMCRYPT_SHA256_STATE pStates, - SIZE_T nStates ) -{ - SIZE_T i; - - for( i=0; ibytesInBuffer; - - UNREFERENCED_PARAMETER( pParHash ); - // - // Function is called when a Result is requested from a parallel hashs state. - // Do the first step of the padding. - // - pState->buffer[bytesInBuffer++] = 0x80; - SymCryptWipe( &pState->buffer[bytesInBuffer], SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - bytesInBuffer ); - - pScratch->pbData = &pState->buffer[0]; - pScratch->cbData = SYMCRYPT_SHA256_INPUT_BLOCK_SIZE; - - if( bytesInBuffer > SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8 ) - { - // We need 2 blocks for the padding - pScratch->processingState = STATE_RESULT2; - } else { - SYMCRYPT_STORE_MSBFIRST64( &pState->buffer[SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8], pState->dataLengthL * 8 ); - pScratch->processingState = STATE_RESULT_DONE; - } - - *pRes = TRUE; // return value from the SetWork function - return TRUE; // Return from the SetWork function -} - - -BOOLEAN -SYMCRYPT_CALL -SymCryptParallelSha256Result2( - _In_ PCSYMCRYPT_PARALLEL_HASH pParHash, - _Inout_ PSYMCRYPT_COMMON_HASH_STATE pState, - _Inout_ PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE pScratch, - _Out_ BOOLEAN *pRes) -{ - UNREFERENCED_PARAMETER( pParHash ); - // - // Called for the 2nd block of a long padding - // - SymCryptWipe( &pState->buffer[0], SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ); - SYMCRYPT_STORE_MSBFIRST64( &pState->buffer[SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8], pState->dataLengthL * 8 ); - pScratch->pbData = &pState->buffer[0]; - pScratch->cbData = SYMCRYPT_SHA256_INPUT_BLOCK_SIZE; - pScratch->processingState = STATE_RESULT_DONE; - *pRes = TRUE; - return TRUE; -} - -VOID -SYMCRYPT_CALL -SymCryptParallelSha256ResultDone( - _In_ PCSYMCRYPT_PARALLEL_HASH pParHash, - _Inout_ PSYMCRYPT_COMMON_HASH_STATE pState, - _In_ PCSYMRYPT_PARALLEL_HASH_OPERATION pOp) -{ - PSYMCRYPT_SHA256_STATE pSha256State = (PSYMCRYPT_SHA256_STATE) pState; - - UNREFERENCED_PARAMETER( pParHash ); - - SYMCRYPT_ASSERT( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_RESULT ); - SYMCRYPT_ASSERT( pOp->cbBuffer == SYMCRYPT_SHA256_RESULT_SIZE ); - - SymCryptUint32ToMsbFirst( &pSha256State->chain.H[0], pOp->pbBuffer, 8 ); - SymCryptWipeKnownSize( pSha256State, sizeof( *pSha256State )); - SymCryptSha256Init( pSha256State ); -} - - -#if 0 - -BOOL -SYMCRYPT_CALL -SymCryptParallelSha256SetNextWork( PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE pScratch ) -{ - PSYMCRYPT_SHA256_STATE pState; - PCSYMRYPT_PARALLEL_HASH_OPERATION pOp; - UINT32 bytesInBuffer; - UINT32 todo; - - // Retrieve the state we will operate on. - pState = (PSYMCRYPT_SHA256_STATE) pScratch->hashState; - - // - // This is a state machine where some states have to iterate - // The loop allows them to use 'continue' for that. 
- // -#pragma warning( suppress: 4127 ) // conditional expression is constant - while( TRUE ) - { - // - // At this point, the processing state, pbData/cbData, and next pointer define what needs to be done. - // STATE_NEXT: cbData == 0 and we have to process the remaining operations. - // STATE_DATA_START: We are working on the next operation; the first BytesAlreadyProcessed have been hashed, - // and the hash state has an empty buffer. - // STATE_DATA_END: We are working on the next operation (an append), and pbData/cbData have whatever partial block remains - // after all the whole blocks have been processed. - // STATE_PAD2: We are working on the next operation (a result), and have processed the first half of a 2-block padding. - // STATE_RESULT: We are working on the next operation (a result), and have processed all the padding. - // - // The pState->dataLength is updated whenver we copy bytes from the append into the state's buffer, or when - // we return TRUE and process bulk data. - // - pOp = pScratch->next; - switch( pScratch->processingState ) - { - case STATE_NEXT: - - if( pOp == NULL ) - { - return FALSE; - } - - bytesInBuffer = pState->bytesInBuffer; - - // SYMCRYPT_ASSERT( pOp->cbBuffer < ((SIZE_T)-1)/2 ); // used during testing - - if( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_APPEND ) - { - pState->dataLengthL += pOp->cbBuffer; - if( bytesInBuffer > 0 ) - { - todo = (UINT32) SYMCRYPT_MIN( SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - bytesInBuffer, pOp->cbBuffer ); - memcpy( &pState->buffer[bytesInBuffer], pOp->pbBuffer, todo ); - pState->bytesInBuffer += todo; - if( pState->bytesInBuffer == SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ) - { - // - // We filled the buffer; set it for processing. - // Remember the # bytes we did and set the next state to process the rest of the request. - // - pScratch->pbData = &pState->buffer[0]; - pScratch->cbData = sizeof( pState->buffer ); - pState->bytesInBuffer = 0; - if( todo == pOp->cbBuffer ) - { - // - // We finished the request after the pbData processing - // - pScratch->next = pOp->next; - // pScratch->processingState = STATE_NEXT // already has that value - } else { - pScratch->processingState = STATE_DATA_START; - SYMCRYPT_ASSERT( todo <= 0xff ); - pScratch->bytesAlreadyProcessed = (BYTE) todo; - } - // - // We process the buffer here, no need to update the dataLength - // - return TRUE; - } else { - // - // We finished the operation; skip to the next one. - // - pScratch->next = pOp->next; - // pScratch->processingState = STATE_NEXT // already has that value - continue; - } - } else { - // - // Buffer is empty; process the bulk data - // - pScratch->pbData = pOp->pbBuffer; - pScratch->cbData = pOp->cbBuffer; - pScratch->processingState = STATE_DATA_END; - - // - // Return TRUE if there is real data to process, and just re-run the state - // machine if we should copy the partial block to the buffer. 
- // - if( pScratch->cbData >= SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ) - { - return TRUE; - } else { - continue; - } - } - } else { - SYMCRYPT_ASSERT( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_RESULT ); - - pState->buffer[bytesInBuffer++] = 0x80; - SymCryptWipe( &pState->buffer[bytesInBuffer], SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - bytesInBuffer ); - - pScratch->pbData = &pState->buffer[0]; - pScratch->cbData = sizeof( pState->buffer ); - - if( bytesInBuffer > SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8 ) - { - // We need 2 blocks for the padding - pScratch->processingState = STATE_PAD2; - } else { - SYMCRYPT_STORE_MSBFIRST64( &pState->buffer[SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8], pState->dataLengthL * 8 ); - pScratch->processingState = STATE_RESULT; - } - return TRUE; - } - break; - - case STATE_DATA_START: - // - // The next operation is an append, and the first few bytes of that operation have already been copied to - // the buffer and processed. We need to process the rest. - // Note that the # bytes remaining is never zero. - // - SYMCRYPT_ASSERT( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_APPEND && pOp->cbBuffer >= pScratch->bytesAlreadyProcessed ); - - pScratch->pbData = pOp->pbBuffer + pScratch->bytesAlreadyProcessed; - pScratch->cbData = pOp->cbBuffer - pScratch->bytesAlreadyProcessed; - if( pScratch->cbData >= SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ) - { - pScratch->processingState = STATE_DATA_END; - return TRUE; - } - - // - // We have less than one block left; this is exactly the same state as we have at the end of - // a normal append. Fall through to that code. - // - // FALLTHROUGH! - - case STATE_DATA_END: - // - // We finished processing the whole blocks of the pScratch->pbData, and have to process the rest. - // The current append is already popped off the work list. - // - if( pScratch->cbData > 0 ) - { - SYMCRYPT_ASSERT( pScratch->cbData < SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ); - memcpy( &pState->buffer[0], pScratch->pbData, pScratch->cbData ); - pState->bytesInBuffer = (UINT32) pScratch->cbData; - } - pScratch->next = pOp->next; - pScratch->processingState = STATE_NEXT; - continue; - - case STATE_PAD2: - SymCryptWipe( &pState->buffer[0], sizeof( pState->buffer )); - SYMCRYPT_STORE_MSBFIRST64( &pState->buffer[SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8], pState->dataLengthL * 8 ); - pScratch->pbData = &pState->buffer[0]; - pScratch->cbData = sizeof( pState->buffer ); - pScratch->processingState = STATE_RESULT; - return TRUE; - - case STATE_RESULT: - SYMCRYPT_ASSERT( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_RESULT ); - - SymCryptUint32ToMsbFirst( &pState->chain.H[0], pOp->pbBuffer, 8 ); - SymCryptWipeKnownSize( pState, sizeof( *pState )); - SymCryptSha256Init( pState ); - - pScratch->next = pOp->next; - pScratch->processingState = STATE_NEXT; - continue; - } - } - -#if 0 // old code, retain until we have the new one working. - ============ old code - - - SIZE_T bytesInBuffer; - SIZE_T todo; - - switch( pScratch->processingState ) - { - case START: - - if( pState->pbData != NULL ) - { - bytesInBuffer = pState->internalState.hashState.dataLength & SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 1; - - // - // There are bytes in the buffer; consume enough input to get rid of them. 
- // - if( bytesInBuffer > 0 ) - { - todo = SYMCRYPT_MIN( SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - bytesInBuffer, pState->cbData ); - memcpy( &pState->internalState.hashState.buffer[bytesInBuffer], pState->pbData, todo ); - pState->pbData += todo; - pState->cbData -= todo; - pState->internalState.hashState.dataLength += todo; - - // - // We don't parallelize the processing of the first block to get to the whole-block state. - // It would mean we get a 1-size block up front, and that interferes with the sorted schedulign - // we do. This is not a common case, and we document that this is inefficient. - // - if( (pState->internalState.hashState.dataLength & (SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 1)) == 0 ) - { - SymCryptSha256AppendBlocks( &pState->internalState.hashState.chain, - &pState->internalState.hashState.buffer[0], - SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ); - } - } - - if( pState->cbData >= SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ) - { - // - // We have more bytes to do; this means that the internal buffer is empty. - // Set the data blocks up for processing. We increment the dataLength here - // as that is part of this function, not of the processing code. - // - pState->internalState.processingState = DATA; - pState->internalState.hashState.dataLength += pState->cbData & ~(SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 1); - return TRUE; - } - - } - - // - // FALL THROUGH TO THE DATA PROCESSING - // - // There are two cases here: - // - the internal buffer is empty and we have between 1 and 63 bytes left to hash. - // - We have no bytes left to hash, but the internal buffer might contain data. - // The first case is exactly what we get after DATA processing. - // The second case is trivially handled by the same code paths as the first one. - // Instead of duplicating the code, - // we fall through to the DATA section. - // - - pState->internalState.processingState = DATA; - - case DATA: - // - // We just finished the data work, or the START code fell through here to handle the - // padding and/or pbResult - // If we just did data processing, the internal buffer is empty. - // If the internal buffer contains data, then cbData == 0. - // - - if( pState->pbData != NULL && pState->cbData > 0 ) - { - SYMCRYPT_ASSERT( pState->cbData < SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ); - memcpy( &pState->internalState.hashState.buffer[0], pState->pbData, pState->cbData ); - pState->internalState.hashState.dataLength += pState->cbData; - } - - // - // This completes the consumption of the pbData. Set it to NULL as per the API spec. - // - - pState->pbData = NULL; - pState->cbData = 0; - - // - // This concludes the data processing. Now let's see if we have to compute the results - // - - if( pState->pbResult == NULL ) - { - return FALSE; - } - - bytesInBuffer = pState->internalState.hashState.dataLength & SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 1; - - // Add the first byte of padding. (Always fits as the buffer is never left full.) - pState->internalState.hashState.buffer[bytesInBuffer++] = 0x80; - SymCryptWipe( &pState->internalState.hashState.buffer[bytesInBuffer], SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - bytesInBuffer ); - - if( bytesInBuffer > SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8 ) - { - // - // We need 2 blocks for the padding. 
- // - pState->internalState.processingState = PAD_INTERMEDIATE; - pState->pbData = &pState->internalState.hashState.buffer[0]; - pState->cbData = SYMCRYPT_SHA256_INPUT_BLOCK_SIZE; - - return TRUE; - } - - // - // Single padding block - // - SYMCRYPT_STORE_MSBFIRST64( &pState->internalState.hashState.buffer[SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8], pState->internalState.hashState.dataLength * 8 ); - pState->internalState.processingState = PAD_FINAL; - pState->pbData = &pState->internalState.hashState.buffer[0]; - pState->cbData = SYMCRYPT_SHA256_INPUT_BLOCK_SIZE; - return TRUE; - - case PAD_INTERMEDIATE: - // - // Done with the intermediate padding, do the final padding. - // We wipe to the end of the buffer, as it is 16-aligned and therefore often faster - // - SymCryptWipe( &pState->internalState.hashState.buffer[0], SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ); - SYMCRYPT_STORE_MSBFIRST64( &pState->internalState.hashState.buffer[SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8], pState->internalState.hashState.dataLength * 8 ); - - pState->internalState.processingState = PAD_FINAL; - pState->pbData = &pState->internalState.hashState.buffer[0]; - pState->cbData = SYMCRYPT_SHA256_INPUT_BLOCK_SIZE; - return TRUE; - - case PAD_FINAL: - SymCryptUint32ToMsbFirst( &pState->internalState.hashState.chain.H[0], pState->pbResult, 8 ); - - SymCryptWipeKnownSize( pState, sizeof( *pState ) ); - SymCryptSha256Init( &pState->internalState.hashState ); - SYMCRYPT_SET_MAGIC( &pState->internalState ); - return FALSE; - } -#endif - - SymCryptFatal( 'psha' ); - return FALSE; -} -#endif - -C_ASSERT( (SYMCRYPT_SIMD_ELEMENT_SIZE & (SYMCRYPT_SIMD_ELEMENT_SIZE - 1 )) == 0 ); // check that it is a power of 2 - - -VOID -SYMCRYPT_CALL -SymCryptParallelSha256Process( - _Inout_updates_( nStates ) PSYMCRYPT_SHA256_STATE pStates, - SIZE_T nStates, - _Inout_updates_( nOperations ) PSYMCRYPT_PARALLEL_HASH_OPERATION pOperations, - SIZE_T nOperations, - _Out_writes_( cbScratch ) PBYTE pbScratch, - SIZE_T cbScratch ) -{ - UINT32 maxParallel; - -#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 - SYMCRYPT_EXTENDED_SAVE_DATA SaveState; - - if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_AVX2 ) && SymCryptSaveYmm( &SaveState ) == SYMCRYPT_NO_ERROR ) - { - maxParallel = 8; - - SymCryptParallelHashProcess( SymCryptParallelSha256Algorithm, - pStates, - nStates, - pOperations, - nOperations, - pbScratch, - cbScratch, - maxParallel ); - - SymCryptRestoreYmm( &SaveState ); - } else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSSE3 ) && SymCryptSaveXmm( &SaveState ) == SYMCRYPT_NO_ERROR ) - { - maxParallel = 4; - SymCryptParallelHashProcess( SymCryptParallelSha256Algorithm, - pStates, - nStates, - pOperations, - nOperations, - pbScratch, - cbScratch, - maxParallel ); - SymCryptRestoreXmm( &SaveState ); - } else { - SymCryptParallelHashProcess_serial( SymCryptParallelSha256Algorithm, pStates, nStates, pOperations, nOperations, pbScratch, cbScratch ); - } - -#elif SYMCRYPT_CPU_ARM - maxParallel = MAX_PARALLEL; - - if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON ) ) - { - SymCryptParallelHashProcess( SymCryptParallelSha256Algorithm, - pStates, - nStates, - pOperations, - nOperations, - pbScratch, - cbScratch, - maxParallel ); - } else { - SymCryptParallelHashProcess_serial( SymCryptParallelSha256Algorithm, pStates, nStates, pOperations, nOperations, pbScratch, cbScratch ); - } -#else - SymCryptParallelHashProcess_serial( SymCryptParallelSha256Algorithm, pStates, nStates, pOperations, nOperations, pbScratch, cbScratch ); -#endif -} - - 
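//
// A minimal caller-side sketch of the parallel API this function implements.
// SYMCRYPT_PARALLEL_SHA256_FIXED_SCRATCH is an assumed macro name for the
// parScratchFixed term in the cbScratch check in SymCryptParallelHashProcess;
// the per-state term and all other names appear in this patch.
//

#define N_STATES 4

VOID
HashFourMessages(
    _In_reads_( N_STATES )  PCBYTE *    pbMsg,
    _In_reads_( N_STATES )  SIZE_T *    cbMsg,
    _Out_                   BYTE        results[N_STATES][SYMCRYPT_SHA256_RESULT_SIZE] )
{
    SYMCRYPT_SHA256_STATE               states[N_STATES];
    SYMCRYPT_PARALLEL_HASH_OPERATION    ops[2 * N_STATES];
    BYTE                                scratch[ SYMCRYPT_PARALLEL_SHA256_FIXED_SCRATCH +               // assumed name
                                                 N_STATES * SYMCRYPT_PARALLEL_HASH_PER_STATE_SCRATCH ];
    SIZE_T i;

    SymCryptParallelSha256Init( states, N_STATES );

    for( i = 0; i < N_STATES; i++ )
    {
        ops[2*i  ].iHash         = i;
        ops[2*i  ].hashOperation = SYMCRYPT_HASH_OPERATION_APPEND;
        ops[2*i  ].pbBuffer      = (PBYTE) pbMsg[i];
        ops[2*i  ].cbBuffer      = cbMsg[i];

        ops[2*i+1].iHash         = i;
        ops[2*i+1].hashOperation = SYMCRYPT_HASH_OPERATION_RESULT;
        ops[2*i+1].pbBuffer      = results[i];
        ops[2*i+1].cbBuffer      = SYMCRYPT_SHA256_RESULT_SIZE;     // anything else hits 'ps2r'
    }

    SymCryptParallelSha256Process( states, N_STATES, ops, 2 * N_STATES, scratch, sizeof( scratch ) );
}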
-#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 -// -// Code that uses the XMM registers. -// - -#define MAJXMM( x, y, z ) _mm_or_si128( _mm_and_si128( _mm_or_si128( z, y ), x ), _mm_and_si128( z, y )) -#define CHXMM( x, y, z ) _mm_xor_si128( _mm_and_si128( _mm_xor_si128( z, y ), x ), z ) - -#define CSIGMA0XMM( x ) \ - _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \ - _mm_slli_epi32(x,30) , _mm_srli_epi32(x, 2) ),\ - _mm_slli_epi32(x,19) ), _mm_srli_epi32(x, 13) ),\ - _mm_slli_epi32(x,10) ), _mm_srli_epi32(x, 22) ) -#define CSIGMA1XMM( x ) \ - _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \ - _mm_slli_epi32(x,26) , _mm_srli_epi32(x, 6) ),\ - _mm_slli_epi32(x,21) ), _mm_srli_epi32(x, 11) ),\ - _mm_slli_epi32(x,7) ), _mm_srli_epi32(x, 25) ) -#define LSIGMA0XMM( x ) \ - _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \ - _mm_slli_epi32(x,25) , _mm_srli_epi32(x, 7) ),\ - _mm_slli_epi32(x,14) ), _mm_srli_epi32(x, 18) ),\ - _mm_srli_epi32(x, 3) ) -#define LSIGMA1XMM( x ) \ - _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \ - _mm_slli_epi32(x,15) , _mm_srli_epi32(x, 17) ),\ - _mm_slli_epi32(x,13) ), _mm_srli_epi32(x, 19) ),\ - _mm_srli_epi32(x,10) ) - -// -// Transpose macro, convert S0..S3 into R0..R3; R0 is the lane 0, R3 is lane 3. -// S0 = S00, S01, S02, S03; S1 = S10, S11, S12, S13; S2 = S20, S21, S22, S23; S3 = S30, S31, S32, S33 -// T0 = S00, S10, S01, S11; T1 = S02, S12, S03, S13; T2 = S20, S30, S21, S31; T3 = S22, S32, S23, S33 -// R0 = S00, S10, S20, S30; R1 = S01, S11, S21, S31; R2 = S02, S12, S22, S32; R3 = S03, S13, S23, S33 -// -#define XMM_TRANSPOSE_32( _R0, _R1, _R2, _R3, _S0, _S1, _S2, _S3 ) \ - {\ - __m128i _T0, _T1, _T2, _T3;\ - _T0 = _mm_unpacklo_epi32( _S0, _S1 ); _T1 = _mm_unpackhi_epi32( _S0, _S1 );\ - _T2 = _mm_unpacklo_epi32( _S2, _S3 ); _T3 = _mm_unpackhi_epi32( _S2, _S3 );\ - _R0 = _mm_unpacklo_epi64( _T0, _T2 ); _R1 = _mm_unpackhi_epi64( _T0, _T2 );\ - _R2 = _mm_unpacklo_epi64( _T1, _T3 ); _R3 = _mm_unpackhi_epi64( _T1, _T3 );\ - } - -VOID -SYMCRYPT_CALL -SymCryptParallelSha256AppendBlocks_xmm( - _Inout_updates_( 4 ) PSYMCRYPT_SHA256_CHAINING_STATE * pChain, - _Inout_updates_( 4 ) PCBYTE * ppByte, - SIZE_T nBytes, - _Out_writes_( PAR_SCRATCH_ELEMENTS ) __m128i * pScratch ) -{ - // - // Implementation that uses 4 lanes in the XMM registers - // - __m128i * buf = pScratch; // chaining state concatenated with the expanded input block - __m128i * W = &buf[4 + 8]; // W are the 64 words of the expanded input - __m128i * ha = &buf[4]; // initial state words, in order h, g, ..., b, a - __m128i A, B, C, D, T; - __m128i T0, T1, T2, T3; - const __m128i BYTE_REVERSE_32 = _mm_set_epi8( 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 ); - int r; - - // - // The chaining state can be unaligned on x86, so we use unalgned loads - // - - T0 = _mm_loadu_si128( (__m128i *)&pChain[0]->H[0] ); - T1 = _mm_loadu_si128( (__m128i *)&pChain[1]->H[0] ); - T2 = _mm_loadu_si128( (__m128i *)&pChain[2]->H[0] ); - T3 = _mm_loadu_si128( (__m128i *)&pChain[3]->H[0] ); - - XMM_TRANSPOSE_32( ha[7], ha[6], ha[5], ha[4], T0, T1, T2, T3 ); - - T0 = _mm_loadu_si128( (__m128i *)&pChain[0]->H[4] ); - T1 = _mm_loadu_si128( (__m128i *)&pChain[1]->H[4] ); - T2 = _mm_loadu_si128( (__m128i *)&pChain[2]->H[4] ); - T3 = _mm_loadu_si128( (__m128i *)&pChain[3]->H[4] ); - - XMM_TRANSPOSE_32( ha[3], ha[2], ha[1], ha[0], T0, T1, T2, T3 ); - - buf[0] = ha[4]; - buf[1] = ha[5]; - buf[2] = ha[6]; - buf[3] = ha[7]; - - while( nBytes >= 64 ) - { 
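        //
        // Each iteration consumes one 64-byte block from every lane:
        //   1. load the input 16 bytes at a time per lane, byte-reverse each
        //      32-bit word (the input is MSB-first), and transpose so that
        //      W[r] holds word r of all four lanes,
        //   2. expand the schedule into W[16..63],
        //   3. run the 64 rounds, keeping h,g,f,e in W[r-8 .. r-5],
        //   4. add the feedback words saved in buf[0..3] and ha[].
        //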
- - // - // Capture the input into W[0..15] - // - for( r=0; r<16; r += 4 ) - { - T0 = _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *) ppByte[0] ), BYTE_REVERSE_32 ); ppByte[0] += 16; - T1 = _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *) ppByte[1] ), BYTE_REVERSE_32 ); ppByte[1] += 16; - T2 = _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *) ppByte[2] ), BYTE_REVERSE_32 ); ppByte[2] += 16; - T3 = _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *) ppByte[3] ), BYTE_REVERSE_32 ); ppByte[3] += 16; - - XMM_TRANSPOSE_32( W[r], W[r+1], W[r+2], W[r+3], T0, T1, T2, T3 ); - } - - // - // Expand the message - // - A = W[15]; - B = W[14]; - D = W[0]; - for( r=16; r<64; r+= 2 ) - { - // Loop invariant: A=W[r-1], B = W[r-2], D = W[r-16] - - // - // Macro for one word of message expansion. - // Invariant: - // on entry: a = W[r-1], b = W[r-2], d = W[r-16] - // on exit: W[r] computed, a = W[r-1], b = W[r], c = W[r-15] - // - #define EXPAND( a, b, c, d, r ) \ - c = W[r-15]; \ - b = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( d, LSIGMA1XMM( b ) ), W[r-7] ), LSIGMA0XMM( c ) ); \ - W[r] = b; \ - - EXPAND( A, B, C, D, r ); - EXPAND( B, A, D, C, (r+1)); - - #undef EXPAND - } - - A = ha[7]; - B = ha[6]; - C = ha[5]; - D = ha[4]; - - for( r=0; r<64; r += 4 ) - { - // - // Loop invariant: - // A, B, C, and D are the a,b,c,d values of the current state. - // W[r] is the next expanded message word to be processed. - // W[r-8 .. r-5] contain the current state words h, g, f, e. - // - - // - // Macro to compute one round - // - #define DO_ROUND( a, b, c, d, t, r ) \ - t = W[r]; \ - t = _mm_add_epi32( t, CSIGMA1XMM( W[r-5] ) ); \ - t = _mm_add_epi32( t, W[r-8] ); \ - t = _mm_add_epi32( t, CHXMM( W[r-5], W[r-6], W[r-7] ) ); \ - t = _mm_add_epi32( t, _mm_set1_epi32( SymCryptSha256K[r] )); \ - W[r-4] = _mm_add_epi32( t, d ); \ - d = _mm_add_epi32( t, CSIGMA0XMM( a ) ); \ - d = _mm_add_epi32( d, MAJXMM( c, b, a ) ); - - DO_ROUND( A, B, C, D, T, r ); - DO_ROUND( D, A, B, C, T, (r+1) ); - DO_ROUND( C, D, A, B, T, (r+2) ); - DO_ROUND( B, C, D, A, T, (r+3) ); - #undef DO_ROUND - } - - buf[3] = ha[7] = _mm_add_epi32( buf[3], A ); - buf[2] = ha[6] = _mm_add_epi32( buf[2], B ); - buf[1] = ha[5] = _mm_add_epi32( buf[1], C ); - buf[0] = ha[4] = _mm_add_epi32( buf[0], D ); - ha[3] = _mm_add_epi32( ha[3], W[r-5] ); - ha[2] = _mm_add_epi32( ha[2], W[r-6] ); - ha[1] = _mm_add_epi32( ha[1], W[r-7] ); - ha[0] = _mm_add_epi32( ha[0], W[r-8] ); - - nBytes -= 64; - } - - // - // Copy the chaining state back into the hash structure - // - XMM_TRANSPOSE_32( T0, T1, T2, T3, ha[7], ha[6], ha[5], ha[4] ); - _mm_storeu_si128( (__m128i *)&pChain[0]->H[0], T0 ); - _mm_storeu_si128( (__m128i *)&pChain[1]->H[0], T1 ); - _mm_storeu_si128( (__m128i *)&pChain[2]->H[0], T2 ); - _mm_storeu_si128( (__m128i *)&pChain[3]->H[0], T3 ); - - XMM_TRANSPOSE_32( T0, T1, T2, T3, ha[3], ha[2], ha[1], ha[0] ); - _mm_storeu_si128( (__m128i *)&pChain[0]->H[4], T0 ); - _mm_storeu_si128( (__m128i *)&pChain[1]->H[4], T1 ); - _mm_storeu_si128( (__m128i *)&pChain[2]->H[4], T2 ); - _mm_storeu_si128( (__m128i *)&pChain[3]->H[4], T3 ); - -} - - -// -// Code that uses the YMM registers. 
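//
// The CSIGMA/LSIGMA shift-xor chains here and in the XMM code above emulate
// 32-bit rotates, which SSE/AVX2 lack as a single instruction. In scalar form
// they are exactly the FIPS 180-4 functions; a self-contained reference:
//

#include <stdint.h>

static inline uint32_t ror32( uint32_t x, int n ) { return (x >> n) | (x << (32 - n)); }

static inline uint32_t csigma0( uint32_t x ) { return ror32(x,  2) ^ ror32(x, 13) ^ ror32(x, 22); }
static inline uint32_t csigma1( uint32_t x ) { return ror32(x,  6) ^ ror32(x, 11) ^ ror32(x, 25); }
static inline uint32_t lsigma0( uint32_t x ) { return ror32(x,  7) ^ ror32(x, 18) ^ (x >>  3); }
static inline uint32_t lsigma1( uint32_t x ) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }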
-// - -#define MAJYMM( x, y, z ) _mm256_or_si256( _mm256_and_si256( _mm256_or_si256( z, y ), x ), _mm256_and_si256( z, y )) -#define CHYMM( x, y, z ) _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( z, y ), x ), z ) - -#define CSIGMA0YMM( x ) \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_slli_epi32(x,30) , _mm256_srli_epi32(x, 2) ),\ - _mm256_slli_epi32(x,19) ), _mm256_srli_epi32(x, 13) ),\ - _mm256_slli_epi32(x,10) ), _mm256_srli_epi32(x, 22) ) -#define CSIGMA1YMM( x ) \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_slli_epi32(x,26) , _mm256_srli_epi32(x, 6) ),\ - _mm256_slli_epi32(x,21) ), _mm256_srli_epi32(x, 11) ),\ - _mm256_slli_epi32(x,7) ), _mm256_srli_epi32(x, 25) ) -#define LSIGMA0YMM( x ) \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_slli_epi32(x,25) , _mm256_srli_epi32(x, 7) ),\ - _mm256_slli_epi32(x,14) ), _mm256_srli_epi32(x, 18) ),\ - _mm256_srli_epi32(x, 3) ) -#define LSIGMA1YMM( x ) \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_slli_epi32(x,15) , _mm256_srli_epi32(x, 17) ),\ - _mm256_slli_epi32(x,13) ), _mm256_srli_epi32(x, 19) ),\ - _mm256_srli_epi32(x,10) ) - -// -// Transpose macro, convert S0..S7 into R0..R7; R0 is the lane 0, R3 is lane 7. -// -// -// S0 = S00, S01, S02, S03, S04, S05, S06, S07 -// S1 = S10, S11, S12, S13, S14, S15, S16, S17 -// S2 = S20, S21, S22, S23, S24, S25, S26, S27 -// S3 = S30, S31, S32, S33, S34, S35, S36, S37 -// S4 = S40, S41, S42, S43, S44, S45, S46, S47 -// S5 = S50, S51, S52, S53, S54, S55, S56, S57 -// S6 = S60, S61, S62, S63, S64, S65, S66, S67 -// S7 = S70, S71, S72, S73, S74, S75, S76, S77 -// -// T0 = S00, S10, S01, S11, S04, S14, S05, S15 -// T1 = S02, S12, S03, S13, S06, S16, S07, S17 -// T2 = S20, S30, S21, S31, S24, S34, S25, S35 -// T3 = S22, S32, S23, S33, S26, S36, S27, S37 -// T4 = S40, S50, S41, S51, S44, S54, S45, S55 -// T5 = S42, S52, S43, S53, S46, S56, S47, S57 -// T6 = S60, S70, S61, S71, S64, S74, S65, S75 -// T7 = S62, S72, S63, S73, S66, S76, S67, S77 -// -// U0 = S00, S10, S20, S30, S04, S14, S24, S34 -// U1 = S01, S11, S21, S31, S05, S15, S25, S35 -// U2 = S02, S12, S22, S32, S06, S16, S26, S36 -// U3 = S03, S13, S23, S33, S07, S17, S27, S37 -// U4 = S40, S50, S60, S70, S44, S54, S64, S74 -// U5 = S41, S51, S61, S71, S45, S55, S65, S75 -// U6 = S42, S52, S62, S72, S46, S56, S66, S76 -// U7 = S43, S53, S63, S73, S47, S47, S67, S77 -// -// R0 = s00, s10, s20, s30, s40, s50, s60, s70 -// R1 = s01, s11, s21, s31, s41, s51, s61, s71 -// R2 = s02, s12, s22, s32, s42, s52, s62, s72 -// R3 = s03, s13, s23, s33, s43, s53, s63, s73 -// R4 = s04, s14, s24, s34, s44, s54, s64, s74 -// R5 = s05, s15, s25, s35, s45, s55, s65, s75 -// R6 = s06, s16, s26, s36, s46, s56, s66, s76 -// R7 = s07, s17, s27, s37, s47, s57, s67, s77 -// -#define YMM_TRANSPOSE_32( _R0, _R1, _R2, _R3, _R4, _R5, _R6, _R7, _S0, _S1, _S2, _S3, _S4, _S5, _S6, _S7 ) \ - {\ - __m256i _T0, _T1, _T2, _T3, _T4, _T5, _T6, _T7;\ - __m256i _U0, _U1, _U2, _U3, _U4, _U5, _U6, _U7;\ - _T0 = _mm256_unpacklo_epi32( _S0, _S1 ); _T1 = _mm256_unpackhi_epi32( _S0, _S1 );\ - _T2 = _mm256_unpacklo_epi32( _S2, _S3 ); _T3 = _mm256_unpackhi_epi32( _S2, _S3 );\ - _T4 = _mm256_unpacklo_epi32( _S4, _S5 ); _T5 = _mm256_unpackhi_epi32( _S4, _S5 );\ - _T6 = _mm256_unpacklo_epi32( _S6, _S7 ); _T7 = _mm256_unpackhi_epi32( _S6, _S7 );\ - \ - _U0 = _mm256_unpacklo_epi64( _T0, _T2 ); _U1 = 
_mm256_unpackhi_epi64( _T0, _T2 );\ - _U2 = _mm256_unpacklo_epi64( _T1, _T3 ); _U3 = _mm256_unpackhi_epi64( _T1, _T3 );\ - _U4 = _mm256_unpacklo_epi64( _T4, _T6 ); _U5 = _mm256_unpackhi_epi64( _T4, _T6 );\ - _U6 = _mm256_unpacklo_epi64( _T5, _T7 ); _U7 = _mm256_unpackhi_epi64( _T5, _T7 );\ - \ - _R0 = _mm256_permute2x128_si256( _U0, _U4, 0x20 ); _R1 = _mm256_permute2x128_si256( _U1, _U5, 0x20);\ - _R2 = _mm256_permute2x128_si256( _U2, _U6, 0x20 ); _R3 = _mm256_permute2x128_si256( _U3, _U7, 0x20);\ - _R4 = _mm256_permute2x128_si256( _U0, _U4, 0x31 ); _R5 = _mm256_permute2x128_si256( _U1, _U5, 0x31);\ - _R6 = _mm256_permute2x128_si256( _U2, _U6, 0x31 ); _R7 = _mm256_permute2x128_si256( _U3, _U7, 0x31);\ - } - -VOID -SYMCRYPT_CALL -SymCryptParallelSha256AppendBlocks_ymm( - _Inout_updates_( 8 ) PSYMCRYPT_SHA256_CHAINING_STATE * pChain, - _Inout_updates_( 8 ) PCBYTE * ppByte, - SIZE_T nBytes, - _Out_writes_( PAR_SCRATCH_ELEMENTS ) __m256i * pScratch ) -{ - // - // Implementation that uses 8 lanes in the YMM registers - // - __m256i * buf = pScratch; - __m256i * W = &buf[4 + 8]; - __m256i * ha = &buf[4]; // initial state words, in order h, g, ..., b, a - __m256i A, B, C, D, T; - __m256i T0, T1, T2, T3, T4, T5, T6, T7; - __m256i BYTE_REVERSE_32; - int r; - - _mm256_zeroupper(); - BYTE_REVERSE_32 = _mm256_set_epi8( 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 ); - - // - // The chaining state can be unaligned on x86, so we use unalgned loads - // - - T0 = _mm256_loadu_si256( (__m256i *)&pChain[0]->H[0] ); - T1 = _mm256_loadu_si256( (__m256i *)&pChain[1]->H[0] ); - T2 = _mm256_loadu_si256( (__m256i *)&pChain[2]->H[0] ); - T3 = _mm256_loadu_si256( (__m256i *)&pChain[3]->H[0] ); - T4 = _mm256_loadu_si256( (__m256i *)&pChain[4]->H[0] ); - T5 = _mm256_loadu_si256( (__m256i *)&pChain[5]->H[0] ); - T6 = _mm256_loadu_si256( (__m256i *)&pChain[6]->H[0] ); - T7 = _mm256_loadu_si256( (__m256i *)&pChain[7]->H[0] ); - - YMM_TRANSPOSE_32( ha[7], ha[6], ha[5], ha[4], ha[3], ha[2], ha[1], ha[0], T0, T1, T2, T3, T4, T5, T6, T7 ); - - buf[0] = ha[4]; - buf[1] = ha[5]; - buf[2] = ha[6]; - buf[3] = ha[7]; - - while( nBytes >= 64 ) - { - - // - // Capture the input into W[0..15] - // - for( r=0; r<16; r += 8 ) - { - T0 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[0] ), BYTE_REVERSE_32 ); ppByte[0] += 32; - T1 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[1] ), BYTE_REVERSE_32 ); ppByte[1] += 32; - T2 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[2] ), BYTE_REVERSE_32 ); ppByte[2] += 32; - T3 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[3] ), BYTE_REVERSE_32 ); ppByte[3] += 32; - T4 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[4] ), BYTE_REVERSE_32 ); ppByte[4] += 32; - T5 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[5] ), BYTE_REVERSE_32 ); ppByte[5] += 32; - T6 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[6] ), BYTE_REVERSE_32 ); ppByte[6] += 32; - T7 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[7] ), BYTE_REVERSE_32 ); ppByte[7] += 32; - - YMM_TRANSPOSE_32( W[r], W[r+1], W[r+2], W[r+3], W[r+4], W[r+5], W[r+6], W[r+7], T0, T1, T2, T3, T4, T5, T6, T7 ); - } - - // - // Expand the message - // - A = W[15]; - B = W[14]; - D = W[0]; - for( r=16; r<64; r+= 2 ) - { - // Loop invariant: A=W[r-1], B = W[r-2], D = W[r-16] - - // - // Macro for one word of message expansion. 
- // Invariant: - // on entry: a = W[r-1], b = W[r-2], d = W[r-16] - // on exit: W[r] computed, a = W[r-1], b = W[r], c = W[r-15] - // - #define EXPAND( a, b, c, d, r ) \ - c = W[r-15]; \ - b = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( d, LSIGMA1YMM( b ) ), W[r-7] ), LSIGMA0YMM( c ) ); \ - W[r] = b; \ - - EXPAND( A, B, C, D, r ); - EXPAND( B, A, D, C, (r+1)); - - #undef EXPAND - } - - A = ha[7]; - B = ha[6]; - C = ha[5]; - D = ha[4]; - - for( r=0; r<64; r += 4 ) - { - // - // Loop invariant: - // A, B, C, and D are the a,b,c,d values of the current state. - // W[r] is the next expanded message word to be processed. - // W[r-8 .. r-5] contain the current state words h, g, f, e. - // - - // - // Macro to compute one round - // - #define DO_ROUND( a, b, c, d, t, r ) \ - t = W[r]; \ - t = _mm256_add_epi32( t, CSIGMA1YMM( W[r-5] ) ); \ - t = _mm256_add_epi32( t, W[r-8] ); \ - t = _mm256_add_epi32( t, CHYMM( W[r-5], W[r-6], W[r-7] ) ); \ - t = _mm256_add_epi32( t, _mm256_set1_epi32( SymCryptSha256K[r] )); \ - W[r-4] = _mm256_add_epi32( t, d ); \ - d = _mm256_add_epi32( t, CSIGMA0YMM( a ) ); \ - d = _mm256_add_epi32( d, MAJYMM( c, b, a ) ); - - DO_ROUND( A, B, C, D, T, r ); - DO_ROUND( D, A, B, C, T, (r+1) ); - DO_ROUND( C, D, A, B, T, (r+2) ); - DO_ROUND( B, C, D, A, T, (r+3) ); - #undef DO_ROUND - } - - buf[3] = ha[7] = _mm256_add_epi32( buf[3], A ); - buf[2] = ha[6] = _mm256_add_epi32( buf[2], B ); - buf[1] = ha[5] = _mm256_add_epi32( buf[1], C ); - buf[0] = ha[4] = _mm256_add_epi32( buf[0], D ); - ha[3] = _mm256_add_epi32( ha[3], W[r-5] ); - ha[2] = _mm256_add_epi32( ha[2], W[r-6] ); - ha[1] = _mm256_add_epi32( ha[1], W[r-7] ); - ha[0] = _mm256_add_epi32( ha[0], W[r-8] ); - - nBytes -= 64; - } - - // - // Copy the chaining state back into the hash structure - // - YMM_TRANSPOSE_32( T0, T1, T2, T3, T4, T5, T6, T7, ha[7], ha[6], ha[5], ha[4], ha[3], ha[2], ha[1], ha[0] ); - _mm256_storeu_si256( (__m256i *)&pChain[0]->H[0], T0 ); - _mm256_storeu_si256( (__m256i *)&pChain[1]->H[0], T1 ); - _mm256_storeu_si256( (__m256i *)&pChain[2]->H[0], T2 ); - _mm256_storeu_si256( (__m256i *)&pChain[3]->H[0], T3 ); - _mm256_storeu_si256( (__m256i *)&pChain[4]->H[0], T4 ); - _mm256_storeu_si256( (__m256i *)&pChain[5]->H[0], T5 ); - _mm256_storeu_si256( (__m256i *)&pChain[6]->H[0], T6 ); - _mm256_storeu_si256( (__m256i *)&pChain[7]->H[0], T7 ); - - _mm256_zeroupper(); - -} - - - -#endif // CPU_X86_X64 - -#if SYMCRYPT_CPU_ARM -// -// Code that uses the Neon registers. 
-// - -#define MAJ( x, y, z ) vorrq_u32( vandq_u32( vorrq_u32( z, y ), x ), vandq_u32( z, y )) -#define CH( x, y, z ) veorq_u32( vandq_u32( veorq_u32( z, y ), x ), z ) - -#define CSIGMA0( x ) \ - veorq_u32( veorq_u32( veorq_u32( veorq_u32( veorq_u32( \ - vshlq_n_u32(x,30) , vshrq_n_u32(x, 2) ),\ - vshlq_n_u32(x,19) ), vshrq_n_u32(x, 13) ),\ - vshlq_n_u32(x,10) ), vshrq_n_u32(x, 22) ) -#define CSIGMA1( x ) \ - veorq_u32( veorq_u32( veorq_u32( veorq_u32( veorq_u32( \ - vshlq_n_u32(x,26) , vshrq_n_u32(x, 6) ),\ - vshlq_n_u32(x,21) ), vshrq_n_u32(x, 11) ),\ - vshlq_n_u32(x,7) ), vshrq_n_u32(x, 25) ) -#define LSIGMA0( x ) \ - veorq_u32( veorq_u32( veorq_u32( veorq_u32( \ - vshlq_n_u32(x,25) , vshrq_n_u32(x, 7) ),\ - vshlq_n_u32(x,14) ), vshrq_n_u32(x, 18) ),\ - vshrq_n_u32(x, 3) ) -#define LSIGMA1( x ) \ - veorq_u32( veorq_u32( veorq_u32( veorq_u32( \ - vshlq_n_u32(x,15) , vshrq_n_u32(x, 17) ),\ - vshlq_n_u32(x,13) ), vshrq_n_u32(x, 19) ),\ - vshrq_n_u32(x,10) ) - -VOID -SYMCRYPT_CALL -SymCryptParallelSha256AppendBlocks_neon( - _Inout_updates_( 4 ) PSYMCRYPT_SHA256_CHAINING_STATE * pChain, - _Inout_updates_( 4 ) PCBYTE * ppByte, - SIZE_T nBytes, - _Out_writes_( PAR_SCRATCH_ELEMENTS ) __n128 * pScratch ) -{ - // - // Implementation that uses 4 lanes in the Neon registers - // - __n128 * buf = pScratch; - __n128 * W = &buf[4 + 8]; - __n128 * ha = &buf[4]; // initial state words, in order h, g, ..., b, a - __n128 A, B, C, D, T; - __n128 T0; - int r; - - // - // This can probably be done faster, but we are missing the VTRN.64 instruction - // which makes it hard to do this efficient in intrinsics. - // The vsetq_lane_u32 seems to be completely ignored by the compiler. - // - ha[7].n128_u32[0] = pChain[0]->H[0]; - ha[7].n128_u32[1] = pChain[1]->H[0]; - ha[7].n128_u32[2] = pChain[2]->H[0]; - ha[7].n128_u32[3] = pChain[3]->H[0]; - - ha[6].n128_u32[0] = pChain[0]->H[1]; - ha[6].n128_u32[1] = pChain[1]->H[1]; - ha[6].n128_u32[2] = pChain[2]->H[1]; - ha[6].n128_u32[3] = pChain[3]->H[1]; - - ha[5].n128_u32[0] = pChain[0]->H[2]; - ha[5].n128_u32[1] = pChain[1]->H[2]; - ha[5].n128_u32[2] = pChain[2]->H[2]; - ha[5].n128_u32[3] = pChain[3]->H[2]; - - ha[4].n128_u32[0] = pChain[0]->H[3]; - ha[4].n128_u32[1] = pChain[1]->H[3]; - ha[4].n128_u32[2] = pChain[2]->H[3]; - ha[4].n128_u32[3] = pChain[3]->H[3]; - - ha[3].n128_u32[0] = pChain[0]->H[4]; - ha[3].n128_u32[1] = pChain[1]->H[4]; - ha[3].n128_u32[2] = pChain[2]->H[4]; - ha[3].n128_u32[3] = pChain[3]->H[4]; - - ha[2].n128_u32[0] = pChain[0]->H[5]; - ha[2].n128_u32[1] = pChain[1]->H[5]; - ha[2].n128_u32[2] = pChain[2]->H[5]; - ha[2].n128_u32[3] = pChain[3]->H[5]; - - ha[1].n128_u32[0] = pChain[0]->H[6]; - ha[1].n128_u32[1] = pChain[1]->H[6]; - ha[1].n128_u32[2] = pChain[2]->H[6]; - ha[1].n128_u32[3] = pChain[3]->H[6]; - - ha[0].n128_u32[0] = pChain[0]->H[7]; - ha[0].n128_u32[1] = pChain[1]->H[7]; - ha[0].n128_u32[2] = pChain[2]->H[7]; - ha[0].n128_u32[3] = pChain[3]->H[7]; - - - buf[0] = ha[4]; - buf[1] = ha[5]; - buf[2] = ha[6]; - buf[3] = ha[7]; - - while( nBytes >= 64 ) - { - - // - // Capture the input into W[0..15] - // - for( r=0; r<16; r ++ ) - { - T0.n128_u32[0] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[0] ); ppByte[0] += 4; - T0.n128_u32[1] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[1] ); ppByte[1] += 4; - T0.n128_u32[2] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[2] ); ppByte[2] += 4; - T0.n128_u32[3] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[3] ); ppByte[3] += 4; - W[r] = T0; - } - - // - // Expand the message - // - A = W[15]; - B = W[14]; - D = W[0]; - for( r=16; r<64; r+= 2 ) - 
{ - // Loop invariant: A=W[r-1], B = W[r-2], D = W[r-16] - - // - // Macro for one word of message expansion. - // Invariant: - // on entry: a = W[r-1], b = W[r-2], d = W[r-16] - // on exit: W[r] computed, a = W[r-1], b = W[r], c = W[r-15] - // - #define EXPAND( a, b, c, d, r ) \ - c = W[r-15]; \ - b = vaddq_u32( vaddq_u32( vaddq_u32( d, LSIGMA1( b ) ), W[r-7] ), LSIGMA0( c ) ); \ - W[r] = b; \ - - EXPAND( A, B, C, D, r ); - EXPAND( B, A, D, C, (r+1)); - - #undef EXPAND - } - - A = ha[7]; - B = ha[6]; - C = ha[5]; - D = ha[4]; - - for( r=0; r<64; r += 4 ) - { - // - // Loop invariant: - // A, B, C, and D are the a,b,c,d values of the current state. - // W[r] is the next expanded message word to be processed. - // W[r-8 .. r-5] contain the current state words h, g, f, e. - // - - // - // Macro to compute one round - // - #define DO_ROUND( a, b, c, d, t, r ) \ - t = W[r]; \ - t = vaddq_u32( t, CSIGMA1( W[r-5] ) ); \ - t = vaddq_u32( t, W[r-8] ); \ - t = vaddq_u32( t, CH( W[r-5], W[r-6], W[r-7] ) ); \ - t = vaddq_u32( t, vdupq_n_u32( SymCryptSha256K[r] )); \ - W[r-4] = vaddq_u32( t, d ); \ - d = vaddq_u32( t, CSIGMA0( a ) ); \ - d = vaddq_u32( d, MAJ( c, b, a ) ); - - DO_ROUND( A, B, C, D, T, r ); - DO_ROUND( D, A, B, C, T, (r+1) ); - DO_ROUND( C, D, A, B, T, (r+2) ); - DO_ROUND( B, C, D, A, T, (r+3) ); - #undef DO_ROUND - } - - buf[3] = ha[7] = vaddq_u32( buf[3], A ); - buf[2] = ha[6] = vaddq_u32( buf[2], B ); - buf[1] = ha[5] = vaddq_u32( buf[1], C ); - buf[0] = ha[4] = vaddq_u32( buf[0], D ); - ha[3] = vaddq_u32( ha[3], W[r-5] ); - ha[2] = vaddq_u32( ha[2], W[r-6] ); - ha[1] = vaddq_u32( ha[1], W[r-7] ); - ha[0] = vaddq_u32( ha[0], W[r-8] ); - - nBytes -= 64; - } - - // - // Copy the chaining state back into the hash structure - // - - pChain[0]->H[0] = ha[7].n128_u32[0]; - pChain[1]->H[0] = ha[7].n128_u32[1]; - pChain[2]->H[0] = ha[7].n128_u32[2]; - pChain[3]->H[0] = ha[7].n128_u32[3]; - - pChain[0]->H[1] = ha[6].n128_u32[0]; - pChain[1]->H[1] = ha[6].n128_u32[1]; - pChain[2]->H[1] = ha[6].n128_u32[2]; - pChain[3]->H[1] = ha[6].n128_u32[3]; - - pChain[0]->H[2] = ha[5].n128_u32[0]; - pChain[1]->H[2] = ha[5].n128_u32[1]; - pChain[2]->H[2] = ha[5].n128_u32[2]; - pChain[3]->H[2] = ha[5].n128_u32[3]; - - pChain[0]->H[3] = ha[4].n128_u32[0]; - pChain[1]->H[3] = ha[4].n128_u32[1]; - pChain[2]->H[3] = ha[4].n128_u32[2]; - pChain[3]->H[3] = ha[4].n128_u32[3]; - - pChain[0]->H[4] = ha[3].n128_u32[0]; - pChain[1]->H[4] = ha[3].n128_u32[1]; - pChain[2]->H[4] = ha[3].n128_u32[2]; - pChain[3]->H[4] = ha[3].n128_u32[3]; - - pChain[0]->H[5] = ha[2].n128_u32[0]; - pChain[1]->H[5] = ha[2].n128_u32[1]; - pChain[2]->H[5] = ha[2].n128_u32[2]; - pChain[3]->H[5] = ha[2].n128_u32[3]; - - pChain[0]->H[6] = ha[1].n128_u32[0]; - pChain[1]->H[6] = ha[1].n128_u32[1]; - pChain[2]->H[6] = ha[1].n128_u32[2]; - pChain[3]->H[6] = ha[1].n128_u32[3]; - - pChain[0]->H[7] = ha[0].n128_u32[0]; - pChain[1]->H[7] = ha[0].n128_u32[1]; - pChain[2]->H[7] = ha[0].n128_u32[2]; - pChain[3]->H[7] = ha[0].n128_u32[3]; - - SymCryptWipeKnownSize( buf, sizeof( buf ) ); -} - -#undef CH -#undef MAJ -#undef CSIGMA0 -#undef CSIGMA1 -#undef LSIGMA0 -#undef LSIGMA1 - -#endif // CPU_X86_X64 - - - -#if SYMCRYPT_CPU_X86 || SYMCRYPT_CPU_AMD64 || SYMCRYPT_CPU_ARM - -VOID -SYMCRYPT_CALL -SymCryptParallelSha256AppendBytes_serial( - _Inout_updates_( nPar ) PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE * pWork, - _In_range_(1, MAX_PARALLEL) SIZE_T nPar, - SIZE_T nBytes ) -{ - SIZE_T i; - SIZE_T tmp; - - SYMCRYPT_ASSERT( nBytes % SYMCRYPT_SHA256_INPUT_BLOCK_SIZE 
== 0 ); - SYMCRYPT_ASSERT( nPar >= 1 && nPar <= MAX_PARALLEL ); - - for( i=0; i < nPar; i++ ) - { - SYMCRYPT_ASSERT( pWork[i]->cbData >= nBytes ); - SymCryptSha256AppendBlocks( & ((PSYMCRYPT_SHA256_STATE)(pWork[i]->hashState))->chain, pWork[i]->pbData, nBytes, &tmp ); - pWork[i]->pbData += nBytes; - pWork[i]->cbData -= nBytes; - } - return; -} - -VOID -SYMCRYPT_CALL -SymCryptParallelSha256Append( - _Inout_updates_( nPar ) PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE * pWork, - _In_range_(1, MAX_PARALLEL) SIZE_T nPar, - SIZE_T nBytes, - _Out_writes_to_( SYMCRYPT_SIMD_ELEMENT_SIZE * PAR_SCRATCH_ELEMENTS, 0 ) - PBYTE pbSimdScratch, - SIZE_T cbSimdScratch ) -{ - PSYMCRYPT_SHA256_CHAINING_STATE apChain[MAX_PARALLEL]; - PCBYTE apData[MAX_PARALLEL]; - SIZE_T i; - UINT32 maxParallel; - - UNREFERENCED_PARAMETER( cbSimdScratch ); // not referenced on FRE builds - SYMCRYPT_ASSERT( cbSimdScratch >= PAR_SCRATCH_ELEMENTS * SYMCRYPT_SIMD_ELEMENT_SIZE ); - SYMCRYPT_ASSERT( ((UINT_PTR)pbSimdScratch & (SYMCRYPT_SIMD_ELEMENT_SIZE - 1)) == 0 ); - - // - // Compute maxParallel; this is 4 if nPar <= 4, and 8 if nPar = 5, ..., 8. - // This is how many parameter sets we have to set up. - // -#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 - - maxParallel = (nPar + 3) & ~3; - SYMCRYPT_ASSERT( maxParallel == 4 || (maxParallel == 8 && SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_AVX2 )) ); - -#elif SYMCRYPT_CPU_ARM - - maxParallel = 4; - -#endif - - SYMCRYPT_ASSERT( nPar >= 1 && nPar <= maxParallel ); - - if( nPar < MIN_PARALLEL ) - { - SymCryptParallelSha256AppendBytes_serial( pWork, nPar, nBytes ); - - // Done with this function. - goto cleanup; - } - - // - // Our parallel code expects exactly four or eight parallel computations. - // We simply duplicate the first one if we get fewer parallel ones. - // That means we write the result multiple times, but it saves a lot of - // extra if()s in the main codeline. - // - - i = 0; - while( i < nPar ) - { - SYMCRYPT_ASSERT( pWork[i]->cbData >= nBytes ); - apChain[i] = & ((PSYMCRYPT_SHA256_STATE)(pWork[i]->hashState))->chain; - apData[i] = pWork[i]->pbData; - pWork[i]->pbData += nBytes; - pWork[i]->cbData -= nBytes; - i++; - } - - while( i < maxParallel ) - { - apChain[i] = apChain[0]; - apData[i] = apData[0]; - i++; - } - -#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 - if( maxParallel == 8 ) - { - SymCryptParallelSha256AppendBlocks_ymm( &apChain[0], &apData[0], nBytes, (__m256i *)pbSimdScratch ); - } else { - SymCryptParallelSha256AppendBlocks_xmm( &apChain[0], &apData[0], nBytes, (__m128i *)pbSimdScratch ); - } -#elif SYMCRYPT_CPU_ARM - SymCryptParallelSha256AppendBlocks_neon( &apChain[0], &apData[0], nBytes, (__n128 *) pbSimdScratch ); -#else -#error Unknown CPU -#endif - -cleanup: - ;// no cleanup at this moment. 
-}
-
-#endif
-
-/*
-VOID
-SYMCRYPT_CALL
-SymCryptParallelSha256AppendBlocks(
-    _Inout_updates_( nWork ) PSYMCRYPT_PARALLEL_SHA256_STATE * pWork,
-    SIZE_T nWork,
-    SIZE_T nBytes )
-{
-    SIZE_T i;
-
-    SYMCRYPT_ASSERT( nWork >= 1 && nWork <= 4 );
-
-    for( i=0; i < nWork; i++ )
-    {
-        SYMCRYPT_ASSERT( pWork[i]->cbData >= nBytes );
-        SymCryptSha256AppendBlocks( &pWork[i]->internalState.hashState.chain, pWork[i]->pbData, nBytes );
-        pWork[i]->pbData += nBytes;
-        pWork[i]->cbData -= nBytes;
-    }
-}
-
-*/
-
-#endif // SUPPORT_PARALLEL
-
-#if SUPPORT_PARALLEL
-
-const SYMCRYPT_PARALLEL_HASH SymCryptParallelSha256Algorithm_default = {
-    &SymCryptSha256Algorithm_default,
-    PAR_SCRATCH_ELEMENTS * SYMCRYPT_SIMD_ELEMENT_SIZE,
-    &SymCryptParallelSha256Result1,
-    &SymCryptParallelSha256Result2,
-    &SymCryptParallelSha256ResultDone,
-    &SymCryptParallelSha256Append,
-};
-
-#else
-
-//
-// For platforms that do not have a parallel hash implementation
-// we use this structure to provide the necessary data to the _serial
-// implementation of the function.
-//
-const SYMCRYPT_PARALLEL_HASH SymCryptParallelSha256Algorithm_default = {
-    &SymCryptSha256Algorithm_default,
-    PAR_SCRATCH_ELEMENTS * SYMCRYPT_SIMD_ELEMENT_SIZE,
-    NULL,
-    NULL,
-    NULL,
-    NULL,
-};
-
-#endif
-
-const PCSYMCRYPT_PARALLEL_HASH SymCryptParallelSha256Algorithm = &SymCryptParallelSha256Algorithm_default;
-
-
-#define N_SELFTEST_STATES 5 // Just enough to trigger YMM useage
-
-VOID
-SYMCRYPT_CALL
-SymCryptParallelSha256Selftest()
-{
-    SYMCRYPT_SHA256_STATE states[N_SELFTEST_STATES];
-    BYTE result[N_SELFTEST_STATES][SYMCRYPT_SHA256_RESULT_SIZE];
-    SYMCRYPT_PARALLEL_HASH_OPERATION op[2*N_SELFTEST_STATES];
-    BYTE scratch[SYMCRYPT_PARALLEL_SHA256_FIXED_SCRATCH + N_SELFTEST_STATES * SYMCRYPT_PARALLEL_HASH_PER_STATE_SCRATCH];
-    int i;
-
-    SymCryptParallelSha256Init( &states[0], N_SELFTEST_STATES );
-
-    for( i=0; i
+BOOLEAN
+SYMCRYPT_CALL
+SymCryptParallelSha256Result1(
+    _In_ PCSYMCRYPT_PARALLEL_HASH pParHash,
+    _Inout_ PSYMCRYPT_COMMON_HASH_STATE pState,
+    _Inout_ PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE pScratch,
+    _Out_ BOOLEAN *pRes)
+{
+    UINT32 bytesInBuffer = pState->bytesInBuffer;
+
+    UNREFERENCED_PARAMETER( pParHash );
+    //
+    // This function is called when a Result is requested from a parallel hash state.
+    // Do the first step of the padding.
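+    // (Worked example, assuming 5 bytes were hashed in total: bytesInBuffer
+    //  is 5, so buffer[5] becomes 0x80 and buffer[6..55] is wiped; since
+    //  6 <= 56 the length field still fits, and the bit count 5 * 8 = 40 is
+    //  stored MSB-first in buffer[56..63] to finish the padding in a single
+    //  block.)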
+ // + pState->buffer[bytesInBuffer++] = 0x80; + SymCryptWipe( &pState->buffer[bytesInBuffer], SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - bytesInBuffer ); + + pScratch->pbData = &pState->buffer[0]; + pScratch->cbData = SYMCRYPT_SHA256_INPUT_BLOCK_SIZE; + + if( bytesInBuffer > SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8 ) + { + // We need 2 blocks for the padding + pScratch->processingState = STATE_RESULT2; + } else { + SYMCRYPT_STORE_MSBFIRST64( &pState->buffer[SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8], pState->dataLengthL * 8 ); + pScratch->processingState = STATE_RESULT_DONE; + } + + *pRes = TRUE; // return value from the SetWork function + return TRUE; // Return from the SetWork function +} + + +BOOLEAN +SYMCRYPT_CALL +SymCryptParallelSha256Result2( + _In_ PCSYMCRYPT_PARALLEL_HASH pParHash, + _Inout_ PSYMCRYPT_COMMON_HASH_STATE pState, + _Inout_ PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE pScratch, + _Out_ BOOLEAN *pRes) +{ + UNREFERENCED_PARAMETER( pParHash ); + // + // Called for the 2nd block of a long padding + // + SymCryptWipe( &pState->buffer[0], SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ); + SYMCRYPT_STORE_MSBFIRST64( &pState->buffer[SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8], pState->dataLengthL * 8 ); + pScratch->pbData = &pState->buffer[0]; + pScratch->cbData = SYMCRYPT_SHA256_INPUT_BLOCK_SIZE; + pScratch->processingState = STATE_RESULT_DONE; + *pRes = TRUE; + return TRUE; +} + +VOID +SYMCRYPT_CALL +SymCryptParallelSha256ResultDone( + _In_ PCSYMCRYPT_PARALLEL_HASH pParHash, + _Inout_ PSYMCRYPT_COMMON_HASH_STATE pState, + _In_ PCSYMRYPT_PARALLEL_HASH_OPERATION pOp) +{ + PSYMCRYPT_SHA256_STATE pSha256State = (PSYMCRYPT_SHA256_STATE) pState; + + UNREFERENCED_PARAMETER( pParHash ); + + SYMCRYPT_ASSERT( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_RESULT ); + SYMCRYPT_ASSERT( pOp->cbBuffer == SYMCRYPT_SHA256_RESULT_SIZE ); + + SymCryptUint32ToMsbFirst( &pSha256State->chain.H[0], pOp->pbBuffer, 8 ); + SymCryptWipeKnownSize( pSha256State, sizeof( *pSha256State )); + SymCryptSha256Init( pSha256State ); +} + + +#if 0 + +BOOL +SYMCRYPT_CALL +SymCryptParallelSha256SetNextWork( PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE pScratch ) +{ + PSYMCRYPT_SHA256_STATE pState; + PCSYMRYPT_PARALLEL_HASH_OPERATION pOp; + UINT32 bytesInBuffer; + UINT32 todo; + + // Retrieve the state we will operate on. + pState = (PSYMCRYPT_SHA256_STATE) pScratch->hashState; + + // + // This is a state machine where some states have to iterate + // The loop allows them to use 'continue' for that. + // +#pragma warning( suppress: 4127 ) // conditional expression is constant + while( TRUE ) + { + // + // At this point, the processing state, pbData/cbData, and next pointer define what needs to be done. + // STATE_NEXT: cbData == 0 and we have to process the remaining operations. + // STATE_DATA_START: We are working on the next operation; the first BytesAlreadyProcessed have been hashed, + // and the hash state has an empty buffer. + // STATE_DATA_END: We are working on the next operation (an append), and pbData/cbData have whatever partial block remains + // after all the whole blocks have been processed. + // STATE_PAD2: We are working on the next operation (a result), and have processed the first half of a 2-block padding. + // STATE_RESULT: We are working on the next operation (a result), and have processed all the padding. + // + // The pState->dataLength is updated whenever we copy bytes from the append into the state's buffer, or when + // we return TRUE and process bulk data. 
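+        // (Walk-through example, hypothetical: an Append of 100 bytes on an
+        //  empty state copies nothing into the buffer, exposes all 100 bytes
+        //  via pbData/cbData in STATE_DATA_END, and after the caller hashes
+        //  the one whole 64-byte block, the remaining 36 bytes are copied
+        //  into the buffer on the next pass through the state machine.)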
+ // + pOp = pScratch->next; + switch( pScratch->processingState ) + { + case STATE_NEXT: + + if( pOp == NULL ) + { + return FALSE; + } + + bytesInBuffer = pState->bytesInBuffer; + + // SYMCRYPT_ASSERT( pOp->cbBuffer < ((SIZE_T)-1)/2 ); // used during testing + + if( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_APPEND ) + { + pState->dataLengthL += pOp->cbBuffer; + if( bytesInBuffer > 0 ) + { + todo = (UINT32) SYMCRYPT_MIN( SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - bytesInBuffer, pOp->cbBuffer ); + memcpy( &pState->buffer[bytesInBuffer], pOp->pbBuffer, todo ); + pState->bytesInBuffer += todo; + if( pState->bytesInBuffer == SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ) + { + // + // We filled the buffer; set it for processing. + // Remember the # bytes we did and set the next state to process the rest of the request. + // + pScratch->pbData = &pState->buffer[0]; + pScratch->cbData = sizeof( pState->buffer ); + pState->bytesInBuffer = 0; + if( todo == pOp->cbBuffer ) + { + // + // We finished the request after the pbData processing + // + pScratch->next = pOp->next; + // pScratch->processingState = STATE_NEXT // already has that value + } else { + pScratch->processingState = STATE_DATA_START; + SYMCRYPT_ASSERT( todo <= 0xff ); + pScratch->bytesAlreadyProcessed = (BYTE) todo; + } + // + // We process the buffer here, no need to update the dataLength + // + return TRUE; + } else { + // + // We finished the operation; skip to the next one. + // + pScratch->next = pOp->next; + // pScratch->processingState = STATE_NEXT // already has that value + continue; + } + } else { + // + // Buffer is empty; process the bulk data + // + pScratch->pbData = pOp->pbBuffer; + pScratch->cbData = pOp->cbBuffer; + pScratch->processingState = STATE_DATA_END; + + // + // Return TRUE if there is real data to process, and just re-run the state + // machine if we should copy the partial block to the buffer. + // + if( pScratch->cbData >= SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ) + { + return TRUE; + } else { + continue; + } + } + } else { + SYMCRYPT_ASSERT( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_RESULT ); + + pState->buffer[bytesInBuffer++] = 0x80; + SymCryptWipe( &pState->buffer[bytesInBuffer], SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - bytesInBuffer ); + + pScratch->pbData = &pState->buffer[0]; + pScratch->cbData = sizeof( pState->buffer ); + + if( bytesInBuffer > SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8 ) + { + // We need 2 blocks for the padding + pScratch->processingState = STATE_PAD2; + } else { + SYMCRYPT_STORE_MSBFIRST64( &pState->buffer[SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8], pState->dataLengthL * 8 ); + pScratch->processingState = STATE_RESULT; + } + return TRUE; + } + break; + + case STATE_DATA_START: + // + // The next operation is an append, and the first few bytes of that operation have already been copied to + // the buffer and processed. We need to process the rest. + // Note that the # bytes remaining is never zero. + // + SYMCRYPT_ASSERT( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_APPEND && pOp->cbBuffer >= pScratch->bytesAlreadyProcessed ); + + pScratch->pbData = pOp->pbBuffer + pScratch->bytesAlreadyProcessed; + pScratch->cbData = pOp->cbBuffer - pScratch->bytesAlreadyProcessed; + if( pScratch->cbData >= SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ) + { + pScratch->processingState = STATE_DATA_END; + return TRUE; + } + + // + // We have less than one block left; this is exactly the same state as we have at the end of + // a normal append. Fall through to that code. + // + // FALLTHROUGH! 
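+            // (Example, assuming an Append of 70 bytes arrived with 10 bytes
+            //  already buffered: 54 bytes completed the first block, so
+            //  bytesAlreadyProcessed == 54 and cbData == 16 at this point,
+            //  which the STATE_DATA_END code below copies into the buffer.)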
+
+        case STATE_DATA_END:
+            //
+            // We finished processing the whole blocks of the pScratch->pbData, and have to process the rest.
+            // The current append is already popped off the work list.
+            //
+            if( pScratch->cbData > 0 )
+            {
+                SYMCRYPT_ASSERT( pScratch->cbData < SYMCRYPT_SHA256_INPUT_BLOCK_SIZE );
+                memcpy( &pState->buffer[0], pScratch->pbData, pScratch->cbData );
+                pState->bytesInBuffer = (UINT32) pScratch->cbData;
+            }
+            pScratch->next = pOp->next;
+            pScratch->processingState = STATE_NEXT;
+            continue;
+
+        case STATE_PAD2:
+            SymCryptWipe( &pState->buffer[0], sizeof( pState->buffer ));
+            SYMCRYPT_STORE_MSBFIRST64( &pState->buffer[SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8], pState->dataLengthL * 8 );
+            pScratch->pbData = &pState->buffer[0];
+            pScratch->cbData = sizeof( pState->buffer );
+            pScratch->processingState = STATE_RESULT;
+            return TRUE;
+
+        case STATE_RESULT:
+            SYMCRYPT_ASSERT( pOp->hashOperation == SYMCRYPT_HASH_OPERATION_RESULT );
+
+            SymCryptUint32ToMsbFirst( &pState->chain.H[0], pOp->pbBuffer, 8 );
+            SymCryptWipeKnownSize( pState, sizeof( *pState ));
+            SymCryptSha256Init( pState );
+
+            pScratch->next = pOp->next;
+            pScratch->processingState = STATE_NEXT;
+            continue;
+        }
+    }
+
+#if 0 // old code, retain until we have the new one working.
+    ============ old code
+
+
+    SIZE_T bytesInBuffer;
+    SIZE_T todo;
+
+    switch( pScratch->processingState )
+    {
+    case START:
+
+        if( pState->pbData != NULL )
+        {
+            bytesInBuffer = pState->internalState.hashState.dataLength & SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 1;
+
+            //
+            // There are bytes in the buffer; consume enough input to get rid of them.
+            //
+            if( bytesInBuffer > 0 )
+            {
+                todo = SYMCRYPT_MIN( SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - bytesInBuffer, pState->cbData );
+                memcpy( &pState->internalState.hashState.buffer[bytesInBuffer], pState->pbData, todo );
+                pState->pbData += todo;
+                pState->cbData -= todo;
+                pState->internalState.hashState.dataLength += todo;
+
+                //
+                // We don't parallelize the processing of the first block to get to the whole-block state.
+                // It would mean we get a size-1 block up front, and that interferes with the sorted scheduling
+                // we do. This is not a common case, and we document that this is inefficient.
+                //
+                if( (pState->internalState.hashState.dataLength & (SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 1)) == 0 )
+                {
+                    SymCryptSha256AppendBlocks( &pState->internalState.hashState.chain,
+                                                &pState->internalState.hashState.buffer[0],
+                                                SYMCRYPT_SHA256_INPUT_BLOCK_SIZE );
+                }
+            }
+
+            if( pState->cbData >= SYMCRYPT_SHA256_INPUT_BLOCK_SIZE )
+            {
+                //
+                // We have more bytes to do; this means that the internal buffer is empty.
+                // Set the data blocks up for processing. We increment the dataLength here
+                // as that is part of this function, not of the processing code.
+                //
+                pState->internalState.processingState = DATA;
+                pState->internalState.hashState.dataLength += pState->cbData & ~(SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 1);
+                return TRUE;
+            }
+
+        }
+
+        //
+        // FALL THROUGH TO THE DATA PROCESSING
+        //
+        // There are two cases here:
+        // - the internal buffer is empty and we have between 1 and 63 bytes left to hash.
+        // - We have no bytes left to hash, but the internal buffer might contain data.
+        // The first case is exactly what we get after DATA processing.
+        // The second case is trivially handled by the same code paths as the first one.
+        // Instead of duplicating the code,
+        // we fall through to the DATA section.
+ // + + pState->internalState.processingState = DATA; + + case DATA: + // + // We just finished the data work, or the START code fell through here to handle the + // padding and/or pbResult + // If we just did data processing, the internal buffer is empty. + // If the internal buffer contains data, then cbData == 0. + // + + if( pState->pbData != NULL && pState->cbData > 0 ) + { + SYMCRYPT_ASSERT( pState->cbData < SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ); + memcpy( &pState->internalState.hashState.buffer[0], pState->pbData, pState->cbData ); + pState->internalState.hashState.dataLength += pState->cbData; + } + + // + // This completes the consumption of the pbData. Set it to NULL as per the API spec. + // + + pState->pbData = NULL; + pState->cbData = 0; + + // + // This concludes the data processing. Now let's see if we have to compute the results + // + + if( pState->pbResult == NULL ) + { + return FALSE; + } + + bytesInBuffer = pState->internalState.hashState.dataLength & SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 1; + + // Add the first byte of padding. (Always fits as the buffer is never left full.) + pState->internalState.hashState.buffer[bytesInBuffer++] = 0x80; + SymCryptWipe( &pState->internalState.hashState.buffer[bytesInBuffer], SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - bytesInBuffer ); + + if( bytesInBuffer > SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8 ) + { + // + // We need 2 blocks for the padding. + // + pState->internalState.processingState = PAD_INTERMEDIATE; + pState->pbData = &pState->internalState.hashState.buffer[0]; + pState->cbData = SYMCRYPT_SHA256_INPUT_BLOCK_SIZE; + + return TRUE; + } + + // + // Single padding block + // + SYMCRYPT_STORE_MSBFIRST64( &pState->internalState.hashState.buffer[SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8], pState->internalState.hashState.dataLength * 8 ); + pState->internalState.processingState = PAD_FINAL; + pState->pbData = &pState->internalState.hashState.buffer[0]; + pState->cbData = SYMCRYPT_SHA256_INPUT_BLOCK_SIZE; + return TRUE; + + case PAD_INTERMEDIATE: + // + // Done with the intermediate padding, do the final padding. 
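+        // (Worked example, assuming 60 bytes were hashed in total: 0x80 went
+        //  into buffer[60], buffer[61..63] was wiped, and that block has just
+        //  been hashed; this second block is all zero except for the bit
+        //  count 60 * 8 = 480 stored MSB-first in its last 8 bytes.)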
+ // We wipe to the end of the buffer, as it is 16-aligned and therefore often faster + // + SymCryptWipe( &pState->internalState.hashState.buffer[0], SYMCRYPT_SHA256_INPUT_BLOCK_SIZE ); + SYMCRYPT_STORE_MSBFIRST64( &pState->internalState.hashState.buffer[SYMCRYPT_SHA256_INPUT_BLOCK_SIZE - 8], pState->internalState.hashState.dataLength * 8 ); + + pState->internalState.processingState = PAD_FINAL; + pState->pbData = &pState->internalState.hashState.buffer[0]; + pState->cbData = SYMCRYPT_SHA256_INPUT_BLOCK_SIZE; + return TRUE; + + case PAD_FINAL: + SymCryptUint32ToMsbFirst( &pState->internalState.hashState.chain.H[0], pState->pbResult, 8 ); + + SymCryptWipeKnownSize( pState, sizeof( *pState ) ); + SymCryptSha256Init( &pState->internalState.hashState ); + SYMCRYPT_SET_MAGIC( &pState->internalState ); + return FALSE; + } +#endif + + SymCryptFatal( 'psha' ); + return FALSE; +} +#endif + +C_ASSERT( (SYMCRYPT_SIMD_ELEMENT_SIZE & (SYMCRYPT_SIMD_ELEMENT_SIZE - 1 )) == 0 ); // check that it is a power of 2 + + +VOID +SYMCRYPT_CALL +SymCryptParallelSha256Process( + _Inout_updates_( nStates ) PSYMCRYPT_SHA256_STATE pStates, + SIZE_T nStates, + _Inout_updates_( nOperations ) PSYMCRYPT_PARALLEL_HASH_OPERATION pOperations, + SIZE_T nOperations, + _Out_writes_( cbScratch ) PBYTE pbScratch, + SIZE_T cbScratch ) +{ + UINT32 maxParallel; + +#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 + SYMCRYPT_EXTENDED_SAVE_DATA SaveState; + + if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_AVX2 ) && SymCryptSaveYmm( &SaveState ) == SYMCRYPT_NO_ERROR ) + { + maxParallel = 8; + + SymCryptParallelHashProcess( SymCryptParallelSha256Algorithm, + pStates, + nStates, + pOperations, + nOperations, + pbScratch, + cbScratch, + maxParallel ); + + SymCryptRestoreYmm( &SaveState ); + } else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSSE3 ) && SymCryptSaveXmm( &SaveState ) == SYMCRYPT_NO_ERROR ) + { + maxParallel = 4; + SymCryptParallelHashProcess( SymCryptParallelSha256Algorithm, + pStates, + nStates, + pOperations, + nOperations, + pbScratch, + cbScratch, + maxParallel ); + SymCryptRestoreXmm( &SaveState ); + } else { + SymCryptParallelHashProcess_serial( SymCryptParallelSha256Algorithm, pStates, nStates, pOperations, nOperations, pbScratch, cbScratch ); + } + +#elif SYMCRYPT_CPU_ARM + maxParallel = MAX_PARALLEL; + + if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON ) ) + { + SymCryptParallelHashProcess( SymCryptParallelSha256Algorithm, + pStates, + nStates, + pOperations, + nOperations, + pbScratch, + cbScratch, + maxParallel ); + } else { + SymCryptParallelHashProcess_serial( SymCryptParallelSha256Algorithm, pStates, nStates, pOperations, nOperations, pbScratch, cbScratch ); + } +#else + SymCryptParallelHashProcess_serial( SymCryptParallelSha256Algorithm, pStates, nStates, pOperations, nOperations, pbScratch, cbScratch ); +#endif +} + + +#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 +// +// Code that uses the XMM registers. 
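+//
+// Four hash states are processed at once, one per 32-bit lane of each XMM
+// register; the transpose macro below converts the per-state word layout
+// into this per-lane layout and back.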
+//
+
+#define MAJXMM( x, y, z ) _mm_or_si128( _mm_and_si128( _mm_or_si128( z, y ), x ), _mm_and_si128( z, y ))
+#define CHXMM( x, y, z ) _mm_xor_si128( _mm_and_si128( _mm_xor_si128( z, y ), x ), z )
+
+#define CSIGMA0XMM( x ) \
+    _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \
+        _mm_slli_epi32(x,30) , _mm_srli_epi32(x, 2) ),\
+        _mm_slli_epi32(x,19) ), _mm_srli_epi32(x, 13) ),\
+        _mm_slli_epi32(x,10) ), _mm_srli_epi32(x, 22) )
+#define CSIGMA1XMM( x ) \
+    _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \
+        _mm_slli_epi32(x,26) , _mm_srli_epi32(x, 6) ),\
+        _mm_slli_epi32(x,21) ), _mm_srli_epi32(x, 11) ),\
+        _mm_slli_epi32(x,7) ), _mm_srli_epi32(x, 25) )
+#define LSIGMA0XMM( x ) \
+    _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \
+        _mm_slli_epi32(x,25) , _mm_srli_epi32(x, 7) ),\
+        _mm_slli_epi32(x,14) ), _mm_srli_epi32(x, 18) ),\
+        _mm_srli_epi32(x, 3) )
+#define LSIGMA1XMM( x ) \
+    _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( \
+        _mm_slli_epi32(x,15) , _mm_srli_epi32(x, 17) ),\
+        _mm_slli_epi32(x,13) ), _mm_srli_epi32(x, 19) ),\
+        _mm_srli_epi32(x,10) )
+
+//
+// Transpose macro, convert S0..S3 into R0..R3; R0 is lane 0, R3 is lane 3.
+// S0 = S00, S01, S02, S03; S1 = S10, S11, S12, S13; S2 = S20, S21, S22, S23; S3 = S30, S31, S32, S33
+// T0 = S00, S10, S01, S11; T1 = S02, S12, S03, S13; T2 = S20, S30, S21, S31; T3 = S22, S32, S23, S33
+// R0 = S00, S10, S20, S30; R1 = S01, S11, S21, S31; R2 = S02, S12, S22, S32; R3 = S03, S13, S23, S33
+//
+#define XMM_TRANSPOSE_32( _R0, _R1, _R2, _R3, _S0, _S1, _S2, _S3 ) \
+    {\
+        __m128i _T0, _T1, _T2, _T3;\
+        _T0 = _mm_unpacklo_epi32( _S0, _S1 ); _T1 = _mm_unpackhi_epi32( _S0, _S1 );\
+        _T2 = _mm_unpacklo_epi32( _S2, _S3 ); _T3 = _mm_unpackhi_epi32( _S2, _S3 );\
+        _R0 = _mm_unpacklo_epi64( _T0, _T2 ); _R1 = _mm_unpackhi_epi64( _T0, _T2 );\
+        _R2 = _mm_unpacklo_epi64( _T1, _T3 ); _R3 = _mm_unpackhi_epi64( _T1, _T3 );\
+    }
+
+VOID
+SYMCRYPT_CALL
+SymCryptParallelSha256AppendBlocks_xmm(
+    _Inout_updates_( 4 ) PSYMCRYPT_SHA256_CHAINING_STATE * pChain,
+    _Inout_updates_( 4 ) PCBYTE * ppByte,
+    SIZE_T nBytes,
+    _Out_writes_( PAR_SCRATCH_ELEMENTS ) __m128i * pScratch )
+{
+    //
+    // Implementation that uses 4 lanes in the XMM registers
+    //
+    __m128i * buf = pScratch;       // chaining state concatenated with the expanded input block
+    __m128i * W = &buf[4 + 8];      // W are the 64 words of the expanded input
+    __m128i * ha = &buf[4];         // initial state words, in order h, g, ..., b, a
+    __m128i A, B, C, D, T;
+    __m128i T0, T1, T2, T3;
+    const __m128i BYTE_REVERSE_32 = _mm_set_epi8( 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 );
+    int r;
+
+    //
+    // The chaining state can be unaligned on x86, so we use unaligned loads
+    //
+
+    T0 = _mm_loadu_si128( (__m128i *)&pChain[0]->H[0] );
+    T1 = _mm_loadu_si128( (__m128i *)&pChain[1]->H[0] );
+    T2 = _mm_loadu_si128( (__m128i *)&pChain[2]->H[0] );
+    T3 = _mm_loadu_si128( (__m128i *)&pChain[3]->H[0] );
+
+    XMM_TRANSPOSE_32( ha[7], ha[6], ha[5], ha[4], T0, T1, T2, T3 );
+
+    T0 = _mm_loadu_si128( (__m128i *)&pChain[0]->H[4] );
+    T1 = _mm_loadu_si128( (__m128i *)&pChain[1]->H[4] );
+    T2 = _mm_loadu_si128( (__m128i *)&pChain[2]->H[4] );
+    T3 = _mm_loadu_si128( (__m128i *)&pChain[3]->H[4] );
+
+    XMM_TRANSPOSE_32( ha[3], ha[2], ha[1], ha[0], T0, T1, T2, T3 );
+
+    buf[0] = ha[4];
+    buf[1] = ha[5];
+    buf[2] = ha[6];
+    buf[3] = ha[7];
+
+    while( nBytes >= 64 )
+    {
+
+        //
+        // Capture the input into W[0..15]
+        //
+        for( r=0; r<16; r += 4 )
+        {
+            T0
= _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *) ppByte[0] ), BYTE_REVERSE_32 ); ppByte[0] += 16; + T1 = _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *) ppByte[1] ), BYTE_REVERSE_32 ); ppByte[1] += 16; + T2 = _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *) ppByte[2] ), BYTE_REVERSE_32 ); ppByte[2] += 16; + T3 = _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *) ppByte[3] ), BYTE_REVERSE_32 ); ppByte[3] += 16; + + XMM_TRANSPOSE_32( W[r], W[r+1], W[r+2], W[r+3], T0, T1, T2, T3 ); + } + + // + // Expand the message + // + A = W[15]; + B = W[14]; + D = W[0]; + for( r=16; r<64; r+= 2 ) + { + // Loop invariant: A=W[r-1], B = W[r-2], D = W[r-16] + + // + // Macro for one word of message expansion. + // Invariant: + // on entry: a = W[r-1], b = W[r-2], d = W[r-16] + // on exit: W[r] computed, a = W[r-1], b = W[r], c = W[r-15] + // + #define EXPAND( a, b, c, d, r ) \ + c = W[r-15]; \ + b = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( d, LSIGMA1XMM( b ) ), W[r-7] ), LSIGMA0XMM( c ) ); \ + W[r] = b; \ + + EXPAND( A, B, C, D, r ); + EXPAND( B, A, D, C, (r+1)); + + #undef EXPAND + } + + A = ha[7]; + B = ha[6]; + C = ha[5]; + D = ha[4]; + + for( r=0; r<64; r += 4 ) + { + // + // Loop invariant: + // A, B, C, and D are the a,b,c,d values of the current state. + // W[r] is the next expanded message word to be processed. + // W[r-8 .. r-5] contain the current state words h, g, f, e. + // + + // + // Macro to compute one round + // + #define DO_ROUND( a, b, c, d, t, r ) \ + t = W[r]; \ + t = _mm_add_epi32( t, CSIGMA1XMM( W[r-5] ) ); \ + t = _mm_add_epi32( t, W[r-8] ); \ + t = _mm_add_epi32( t, CHXMM( W[r-5], W[r-6], W[r-7] ) ); \ + t = _mm_add_epi32( t, _mm_set1_epi32( SymCryptSha256K[r] )); \ + W[r-4] = _mm_add_epi32( t, d ); \ + d = _mm_add_epi32( t, CSIGMA0XMM( a ) ); \ + d = _mm_add_epi32( d, MAJXMM( c, b, a ) ); + + DO_ROUND( A, B, C, D, T, r ); + DO_ROUND( D, A, B, C, T, (r+1) ); + DO_ROUND( C, D, A, B, T, (r+2) ); + DO_ROUND( B, C, D, A, T, (r+3) ); + #undef DO_ROUND + } + + buf[3] = ha[7] = _mm_add_epi32( buf[3], A ); + buf[2] = ha[6] = _mm_add_epi32( buf[2], B ); + buf[1] = ha[5] = _mm_add_epi32( buf[1], C ); + buf[0] = ha[4] = _mm_add_epi32( buf[0], D ); + ha[3] = _mm_add_epi32( ha[3], W[r-5] ); + ha[2] = _mm_add_epi32( ha[2], W[r-6] ); + ha[1] = _mm_add_epi32( ha[1], W[r-7] ); + ha[0] = _mm_add_epi32( ha[0], W[r-8] ); + + nBytes -= 64; + } + + // + // Copy the chaining state back into the hash structure + // + XMM_TRANSPOSE_32( T0, T1, T2, T3, ha[7], ha[6], ha[5], ha[4] ); + _mm_storeu_si128( (__m128i *)&pChain[0]->H[0], T0 ); + _mm_storeu_si128( (__m128i *)&pChain[1]->H[0], T1 ); + _mm_storeu_si128( (__m128i *)&pChain[2]->H[0], T2 ); + _mm_storeu_si128( (__m128i *)&pChain[3]->H[0], T3 ); + + XMM_TRANSPOSE_32( T0, T1, T2, T3, ha[3], ha[2], ha[1], ha[0] ); + _mm_storeu_si128( (__m128i *)&pChain[0]->H[4], T0 ); + _mm_storeu_si128( (__m128i *)&pChain[1]->H[4], T1 ); + _mm_storeu_si128( (__m128i *)&pChain[2]->H[4], T2 ); + _mm_storeu_si128( (__m128i *)&pChain[3]->H[4], T3 ); + +} + + +// +// Code that uses the YMM registers. 
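+//
+// Same structure as the XMM code above, but with eight hash states at once,
+// one per 32-bit lane of each YMM register; this path requires AVX2.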
+//
+
+#define MAJYMM( x, y, z ) _mm256_or_si256( _mm256_and_si256( _mm256_or_si256( z, y ), x ), _mm256_and_si256( z, y ))
+#define CHYMM( x, y, z ) _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( z, y ), x ), z )
+
+#define CSIGMA0YMM( x ) \
+    _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( \
+        _mm256_slli_epi32(x,30) , _mm256_srli_epi32(x, 2) ),\
+        _mm256_slli_epi32(x,19) ), _mm256_srli_epi32(x, 13) ),\
+        _mm256_slli_epi32(x,10) ), _mm256_srli_epi32(x, 22) )
+#define CSIGMA1YMM( x ) \
+    _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( \
+        _mm256_slli_epi32(x,26) , _mm256_srli_epi32(x, 6) ),\
+        _mm256_slli_epi32(x,21) ), _mm256_srli_epi32(x, 11) ),\
+        _mm256_slli_epi32(x,7) ), _mm256_srli_epi32(x, 25) )
+#define LSIGMA0YMM( x ) \
+    _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( \
+        _mm256_slli_epi32(x,25) , _mm256_srli_epi32(x, 7) ),\
+        _mm256_slli_epi32(x,14) ), _mm256_srli_epi32(x, 18) ),\
+        _mm256_srli_epi32(x, 3) )
+#define LSIGMA1YMM( x ) \
+    _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( \
+        _mm256_slli_epi32(x,15) , _mm256_srli_epi32(x, 17) ),\
+        _mm256_slli_epi32(x,13) ), _mm256_srli_epi32(x, 19) ),\
+        _mm256_srli_epi32(x,10) )
+
+//
+// Transpose macro, convert S0..S7 into R0..R7; R0 is lane 0, R7 is lane 7.
+//
+//
+// S0 = S00, S01, S02, S03, S04, S05, S06, S07
+// S1 = S10, S11, S12, S13, S14, S15, S16, S17
+// S2 = S20, S21, S22, S23, S24, S25, S26, S27
+// S3 = S30, S31, S32, S33, S34, S35, S36, S37
+// S4 = S40, S41, S42, S43, S44, S45, S46, S47
+// S5 = S50, S51, S52, S53, S54, S55, S56, S57
+// S6 = S60, S61, S62, S63, S64, S65, S66, S67
+// S7 = S70, S71, S72, S73, S74, S75, S76, S77
+//
+// T0 = S00, S10, S01, S11, S04, S14, S05, S15
+// T1 = S02, S12, S03, S13, S06, S16, S07, S17
+// T2 = S20, S30, S21, S31, S24, S34, S25, S35
+// T3 = S22, S32, S23, S33, S26, S36, S27, S37
+// T4 = S40, S50, S41, S51, S44, S54, S45, S55
+// T5 = S42, S52, S43, S53, S46, S56, S47, S57
+// T6 = S60, S70, S61, S71, S64, S74, S65, S75
+// T7 = S62, S72, S63, S73, S66, S76, S67, S77
+//
+// U0 = S00, S10, S20, S30, S04, S14, S24, S34
+// U1 = S01, S11, S21, S31, S05, S15, S25, S35
+// U2 = S02, S12, S22, S32, S06, S16, S26, S36
+// U3 = S03, S13, S23, S33, S07, S17, S27, S37
+// U4 = S40, S50, S60, S70, S44, S54, S64, S74
+// U5 = S41, S51, S61, S71, S45, S55, S65, S75
+// U6 = S42, S52, S62, S72, S46, S56, S66, S76
+// U7 = S43, S53, S63, S73, S47, S57, S67, S77
+//
+// R0 = s00, s10, s20, s30, s40, s50, s60, s70
+// R1 = s01, s11, s21, s31, s41, s51, s61, s71
+// R2 = s02, s12, s22, s32, s42, s52, s62, s72
+// R3 = s03, s13, s23, s33, s43, s53, s63, s73
+// R4 = s04, s14, s24, s34, s44, s54, s64, s74
+// R5 = s05, s15, s25, s35, s45, s55, s65, s75
+// R6 = s06, s16, s26, s36, s46, s56, s66, s76
+// R7 = s07, s17, s27, s37, s47, s57, s67, s77
+//
+#define YMM_TRANSPOSE_32( _R0, _R1, _R2, _R3, _R4, _R5, _R6, _R7, _S0, _S1, _S2, _S3, _S4, _S5, _S6, _S7 ) \
+    {\
+        __m256i _T0, _T1, _T2, _T3, _T4, _T5, _T6, _T7;\
+        __m256i _U0, _U1, _U2, _U3, _U4, _U5, _U6, _U7;\
+        _T0 = _mm256_unpacklo_epi32( _S0, _S1 ); _T1 = _mm256_unpackhi_epi32( _S0, _S1 );\
+        _T2 = _mm256_unpacklo_epi32( _S2, _S3 ); _T3 = _mm256_unpackhi_epi32( _S2, _S3 );\
+        _T4 = _mm256_unpacklo_epi32( _S4, _S5 ); _T5 = _mm256_unpackhi_epi32( _S4, _S5 );\
+        _T6 = _mm256_unpacklo_epi32( _S6, _S7 ); _T7 = _mm256_unpackhi_epi32( _S6, _S7 );\
+        \
+        _U0 = _mm256_unpacklo_epi64( _T0, _T2 ); _U1 = _mm256_unpackhi_epi64( _T0, _T2 );\
+        _U2 = _mm256_unpacklo_epi64( _T1, _T3 ); _U3 = _mm256_unpackhi_epi64( _T1, _T3 );\
+        _U4 = _mm256_unpacklo_epi64( _T4, _T6 ); _U5 = _mm256_unpackhi_epi64( _T4, _T6 );\
+        _U6 = _mm256_unpacklo_epi64( _T5, _T7 ); _U7 = _mm256_unpackhi_epi64( _T5, _T7 );\
+        \
+        _R0 = _mm256_permute2x128_si256( _U0, _U4, 0x20 ); _R1 = _mm256_permute2x128_si256( _U1, _U5, 0x20);\
+        _R2 = _mm256_permute2x128_si256( _U2, _U6, 0x20 ); _R3 = _mm256_permute2x128_si256( _U3, _U7, 0x20);\
+        _R4 = _mm256_permute2x128_si256( _U0, _U4, 0x31 ); _R5 = _mm256_permute2x128_si256( _U1, _U5, 0x31);\
+        _R6 = _mm256_permute2x128_si256( _U2, _U6, 0x31 ); _R7 = _mm256_permute2x128_si256( _U3, _U7, 0x31);\
+    }
+
+VOID
+SYMCRYPT_CALL
+SymCryptParallelSha256AppendBlocks_ymm(
+    _Inout_updates_( 8 ) PSYMCRYPT_SHA256_CHAINING_STATE * pChain,
+    _Inout_updates_( 8 ) PCBYTE * ppByte,
+    SIZE_T nBytes,
+    _Out_writes_( PAR_SCRATCH_ELEMENTS ) __m256i * pScratch )
+{
+    //
+    // Implementation that uses 8 lanes in the YMM registers
+    //
+    __m256i * buf = pScratch;
+    __m256i * W = &buf[4 + 8];
+    __m256i * ha = &buf[4];     // initial state words, in order h, g, ..., b, a
+    __m256i A, B, C, D, T;
+    __m256i T0, T1, T2, T3, T4, T5, T6, T7;
+    __m256i BYTE_REVERSE_32;
+    int r;
+
+    _mm256_zeroupper();
+    BYTE_REVERSE_32 = _mm256_set_epi8( 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 );
+
+    //
+    // The chaining state can be unaligned on x86, so we use unaligned loads
+    //
+
+    T0 = _mm256_loadu_si256( (__m256i *)&pChain[0]->H[0] );
+    T1 = _mm256_loadu_si256( (__m256i *)&pChain[1]->H[0] );
+    T2 = _mm256_loadu_si256( (__m256i *)&pChain[2]->H[0] );
+    T3 = _mm256_loadu_si256( (__m256i *)&pChain[3]->H[0] );
+    T4 = _mm256_loadu_si256( (__m256i *)&pChain[4]->H[0] );
+    T5 = _mm256_loadu_si256( (__m256i *)&pChain[5]->H[0] );
+    T6 = _mm256_loadu_si256( (__m256i *)&pChain[6]->H[0] );
+    T7 = _mm256_loadu_si256( (__m256i *)&pChain[7]->H[0] );
+
+    YMM_TRANSPOSE_32( ha[7], ha[6], ha[5], ha[4], ha[3], ha[2], ha[1], ha[0], T0, T1, T2, T3, T4, T5, T6, T7 );
+
+    buf[0] = ha[4];
+    buf[1] = ha[5];
+    buf[2] = ha[6];
+    buf[3] = ha[7];
+
+    while( nBytes >= 64 )
+    {
+
+        //
+        // Capture the input into W[0..15]
+        //
+        for( r=0; r<16; r += 8 )
+        {
+            T0 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[0] ), BYTE_REVERSE_32 ); ppByte[0] += 32;
+            T1 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[1] ), BYTE_REVERSE_32 ); ppByte[1] += 32;
+            T2 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[2] ), BYTE_REVERSE_32 ); ppByte[2] += 32;
+            T3 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[3] ), BYTE_REVERSE_32 ); ppByte[3] += 32;
+            T4 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[4] ), BYTE_REVERSE_32 ); ppByte[4] += 32;
+            T5 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[5] ), BYTE_REVERSE_32 ); ppByte[5] += 32;
+            T6 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[6] ), BYTE_REVERSE_32 ); ppByte[6] += 32;
+            T7 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) ppByte[7] ), BYTE_REVERSE_32 ); ppByte[7] += 32;
+
+            YMM_TRANSPOSE_32( W[r], W[r+1], W[r+2], W[r+3], W[r+4], W[r+5], W[r+6], W[r+7], T0, T1, T2, T3, T4, T5, T6, T7 );
+        }
+
+        //
+        // Expand the message
+        //
+        A = W[15];
+        B = W[14];
+        D = W[0];
+        for( r=16; r<64; r+= 2 )
+        {
+            // Loop invariant: A=W[r-1], B = W[r-2], D = W[r-16]
+
+            //
+            // Macro for one word of message expansion.
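+            // (For reference, the scalar equivalent of one expansion step is
+            //      W[r] = LSIGMA1( W[r-2] ) + W[r-7] + LSIGMA0( W[r-15] ) + W[r-16];
+            //  the macro below computes this for all eight lanes at once.)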
+ // Invariant: + // on entry: a = W[r-1], b = W[r-2], d = W[r-16] + // on exit: W[r] computed, a = W[r-1], b = W[r], c = W[r-15] + // + #define EXPAND( a, b, c, d, r ) \ + c = W[r-15]; \ + b = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( d, LSIGMA1YMM( b ) ), W[r-7] ), LSIGMA0YMM( c ) ); \ + W[r] = b; \ + + EXPAND( A, B, C, D, r ); + EXPAND( B, A, D, C, (r+1)); + + #undef EXPAND + } + + A = ha[7]; + B = ha[6]; + C = ha[5]; + D = ha[4]; + + for( r=0; r<64; r += 4 ) + { + // + // Loop invariant: + // A, B, C, and D are the a,b,c,d values of the current state. + // W[r] is the next expanded message word to be processed. + // W[r-8 .. r-5] contain the current state words h, g, f, e. + // + + // + // Macro to compute one round + // + #define DO_ROUND( a, b, c, d, t, r ) \ + t = W[r]; \ + t = _mm256_add_epi32( t, CSIGMA1YMM( W[r-5] ) ); \ + t = _mm256_add_epi32( t, W[r-8] ); \ + t = _mm256_add_epi32( t, CHYMM( W[r-5], W[r-6], W[r-7] ) ); \ + t = _mm256_add_epi32( t, _mm256_set1_epi32( SymCryptSha256K[r] )); \ + W[r-4] = _mm256_add_epi32( t, d ); \ + d = _mm256_add_epi32( t, CSIGMA0YMM( a ) ); \ + d = _mm256_add_epi32( d, MAJYMM( c, b, a ) ); + + DO_ROUND( A, B, C, D, T, r ); + DO_ROUND( D, A, B, C, T, (r+1) ); + DO_ROUND( C, D, A, B, T, (r+2) ); + DO_ROUND( B, C, D, A, T, (r+3) ); + #undef DO_ROUND + } + + buf[3] = ha[7] = _mm256_add_epi32( buf[3], A ); + buf[2] = ha[6] = _mm256_add_epi32( buf[2], B ); + buf[1] = ha[5] = _mm256_add_epi32( buf[1], C ); + buf[0] = ha[4] = _mm256_add_epi32( buf[0], D ); + ha[3] = _mm256_add_epi32( ha[3], W[r-5] ); + ha[2] = _mm256_add_epi32( ha[2], W[r-6] ); + ha[1] = _mm256_add_epi32( ha[1], W[r-7] ); + ha[0] = _mm256_add_epi32( ha[0], W[r-8] ); + + nBytes -= 64; + } + + // + // Copy the chaining state back into the hash structure + // + YMM_TRANSPOSE_32( T0, T1, T2, T3, T4, T5, T6, T7, ha[7], ha[6], ha[5], ha[4], ha[3], ha[2], ha[1], ha[0] ); + _mm256_storeu_si256( (__m256i *)&pChain[0]->H[0], T0 ); + _mm256_storeu_si256( (__m256i *)&pChain[1]->H[0], T1 ); + _mm256_storeu_si256( (__m256i *)&pChain[2]->H[0], T2 ); + _mm256_storeu_si256( (__m256i *)&pChain[3]->H[0], T3 ); + _mm256_storeu_si256( (__m256i *)&pChain[4]->H[0], T4 ); + _mm256_storeu_si256( (__m256i *)&pChain[5]->H[0], T5 ); + _mm256_storeu_si256( (__m256i *)&pChain[6]->H[0], T6 ); + _mm256_storeu_si256( (__m256i *)&pChain[7]->H[0], T7 ); + + _mm256_zeroupper(); + +} + + + +#endif // CPU_X86_X64 + +#if SYMCRYPT_CPU_ARM +// +// Code that uses the Neon registers. 
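+//
+// Four hash states are processed at once, one per 32-bit lane of each Neon
+// register; input words are loaded MSB-first one lane at a time instead of
+// using a vector byte-reversal shuffle.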
+//
+
+#define MAJ( x, y, z ) vorrq_u32( vandq_u32( vorrq_u32( z, y ), x ), vandq_u32( z, y ))
+#define CH( x, y, z ) veorq_u32( vandq_u32( veorq_u32( z, y ), x ), z )
+
+#define CSIGMA0( x ) \
+    veorq_u32( veorq_u32( veorq_u32( veorq_u32( veorq_u32( \
+        vshlq_n_u32(x,30) , vshrq_n_u32(x, 2) ),\
+        vshlq_n_u32(x,19) ), vshrq_n_u32(x, 13) ),\
+        vshlq_n_u32(x,10) ), vshrq_n_u32(x, 22) )
+#define CSIGMA1( x ) \
+    veorq_u32( veorq_u32( veorq_u32( veorq_u32( veorq_u32( \
+        vshlq_n_u32(x,26) , vshrq_n_u32(x, 6) ),\
+        vshlq_n_u32(x,21) ), vshrq_n_u32(x, 11) ),\
+        vshlq_n_u32(x,7) ), vshrq_n_u32(x, 25) )
+#define LSIGMA0( x ) \
+    veorq_u32( veorq_u32( veorq_u32( veorq_u32( \
+        vshlq_n_u32(x,25) , vshrq_n_u32(x, 7) ),\
+        vshlq_n_u32(x,14) ), vshrq_n_u32(x, 18) ),\
+        vshrq_n_u32(x, 3) )
+#define LSIGMA1( x ) \
+    veorq_u32( veorq_u32( veorq_u32( veorq_u32( \
+        vshlq_n_u32(x,15) , vshrq_n_u32(x, 17) ),\
+        vshlq_n_u32(x,13) ), vshrq_n_u32(x, 19) ),\
+        vshrq_n_u32(x,10) )
+
+VOID
+SYMCRYPT_CALL
+SymCryptParallelSha256AppendBlocks_neon(
+    _Inout_updates_( 4 ) PSYMCRYPT_SHA256_CHAINING_STATE * pChain,
+    _Inout_updates_( 4 ) PCBYTE * ppByte,
+    SIZE_T nBytes,
+    _Out_writes_( PAR_SCRATCH_ELEMENTS ) __n128 * pScratch )
+{
+    //
+    // Implementation that uses 4 lanes in the Neon registers
+    //
+    __n128 * buf = pScratch;
+    __n128 * W = &buf[4 + 8];
+    __n128 * ha = &buf[4];      // initial state words, in order h, g, ..., b, a
+    __n128 A, B, C, D, T;
+    __n128 T0;
+    int r;
+
+    //
+    // This can probably be done faster, but we are missing the VTRN.64 instruction,
+    // which makes it hard to do this efficiently in intrinsics.
+    // The vsetq_lane_u32 seems to be completely ignored by the compiler.
+    //
+    ha[7].n128_u32[0] = pChain[0]->H[0];
+    ha[7].n128_u32[1] = pChain[1]->H[0];
+    ha[7].n128_u32[2] = pChain[2]->H[0];
+    ha[7].n128_u32[3] = pChain[3]->H[0];
+
+    ha[6].n128_u32[0] = pChain[0]->H[1];
+    ha[6].n128_u32[1] = pChain[1]->H[1];
+    ha[6].n128_u32[2] = pChain[2]->H[1];
+    ha[6].n128_u32[3] = pChain[3]->H[1];
+
+    ha[5].n128_u32[0] = pChain[0]->H[2];
+    ha[5].n128_u32[1] = pChain[1]->H[2];
+    ha[5].n128_u32[2] = pChain[2]->H[2];
+    ha[5].n128_u32[3] = pChain[3]->H[2];
+
+    ha[4].n128_u32[0] = pChain[0]->H[3];
+    ha[4].n128_u32[1] = pChain[1]->H[3];
+    ha[4].n128_u32[2] = pChain[2]->H[3];
+    ha[4].n128_u32[3] = pChain[3]->H[3];
+
+    ha[3].n128_u32[0] = pChain[0]->H[4];
+    ha[3].n128_u32[1] = pChain[1]->H[4];
+    ha[3].n128_u32[2] = pChain[2]->H[4];
+    ha[3].n128_u32[3] = pChain[3]->H[4];
+
+    ha[2].n128_u32[0] = pChain[0]->H[5];
+    ha[2].n128_u32[1] = pChain[1]->H[5];
+    ha[2].n128_u32[2] = pChain[2]->H[5];
+    ha[2].n128_u32[3] = pChain[3]->H[5];
+
+    ha[1].n128_u32[0] = pChain[0]->H[6];
+    ha[1].n128_u32[1] = pChain[1]->H[6];
+    ha[1].n128_u32[2] = pChain[2]->H[6];
+    ha[1].n128_u32[3] = pChain[3]->H[6];
+
+    ha[0].n128_u32[0] = pChain[0]->H[7];
+    ha[0].n128_u32[1] = pChain[1]->H[7];
+    ha[0].n128_u32[2] = pChain[2]->H[7];
+    ha[0].n128_u32[3] = pChain[3]->H[7];
+
+
+    buf[0] = ha[4];
+    buf[1] = ha[5];
+    buf[2] = ha[6];
+    buf[3] = ha[7];
+
+    while( nBytes >= 64 )
+    {
+
+        //
+        // Capture the input into W[0..15]
+        //
+        for( r=0; r<16; r ++ )
+        {
+            T0.n128_u32[0] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[0] ); ppByte[0] += 4;
+            T0.n128_u32[1] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[1] ); ppByte[1] += 4;
+            T0.n128_u32[2] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[2] ); ppByte[2] += 4;
+            T0.n128_u32[3] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[3] ); ppByte[3] += 4;
+            W[r] = T0;
+        }
+
+        //
+        // Expand the message
+        //
+        A = W[15];
+        B = W[14];
+        D = W[0];
+        for( r=16; r<64; r+= 2 )
+        {
+            // Loop invariant: A=W[r-1], B = W[r-2], D = W[r-16]
+
+            //
+            // Macro for one word of message expansion.
+            // Invariant:
+            // on entry: a = W[r-1], b = W[r-2], d = W[r-16]
+            // on exit: W[r] computed, a = W[r-1], b = W[r], c = W[r-15]
+            //
+            #define EXPAND( a, b, c, d, r ) \
+                c = W[r-15]; \
+                b = vaddq_u32( vaddq_u32( vaddq_u32( d, LSIGMA1( b ) ), W[r-7] ), LSIGMA0( c ) ); \
+                W[r] = b; \
+
+            EXPAND( A, B, C, D, r );
+            EXPAND( B, A, D, C, (r+1));
+
+            #undef EXPAND
+        }
+
+        A = ha[7];
+        B = ha[6];
+        C = ha[5];
+        D = ha[4];
+
+        for( r=0; r<64; r += 4 )
+        {
+            //
+            // Loop invariant:
+            // A, B, C, and D are the a,b,c,d values of the current state.
+            // W[r] is the next expanded message word to be processed.
+            // W[r-8 .. r-5] contain the current state words h, g, f, e.
+            //
+
+            //
+            // Macro to compute one round
+            //
+            #define DO_ROUND( a, b, c, d, t, r ) \
+                t = W[r]; \
+                t = vaddq_u32( t, CSIGMA1( W[r-5] ) ); \
+                t = vaddq_u32( t, W[r-8] ); \
+                t = vaddq_u32( t, CH( W[r-5], W[r-6], W[r-7] ) ); \
+                t = vaddq_u32( t, vdupq_n_u32( SymCryptSha256K[r] )); \
+                W[r-4] = vaddq_u32( t, d ); \
+                d = vaddq_u32( t, CSIGMA0( a ) ); \
+                d = vaddq_u32( d, MAJ( c, b, a ) );
+
+            DO_ROUND( A, B, C, D, T, r );
+            DO_ROUND( D, A, B, C, T, (r+1) );
+            DO_ROUND( C, D, A, B, T, (r+2) );
+            DO_ROUND( B, C, D, A, T, (r+3) );
+            #undef DO_ROUND
+        }
+
+        buf[3] = ha[7] = vaddq_u32( buf[3], A );
+        buf[2] = ha[6] = vaddq_u32( buf[2], B );
+        buf[1] = ha[5] = vaddq_u32( buf[1], C );
+        buf[0] = ha[4] = vaddq_u32( buf[0], D );
+        ha[3] = vaddq_u32( ha[3], W[r-5] );
+        ha[2] = vaddq_u32( ha[2], W[r-6] );
+        ha[1] = vaddq_u32( ha[1], W[r-7] );
+        ha[0] = vaddq_u32( ha[0], W[r-8] );
+
+        nBytes -= 64;
+    }
+
+    //
+    // Copy the chaining state back into the hash structure
+    //
+
+    pChain[0]->H[0] = ha[7].n128_u32[0];
+    pChain[1]->H[0] = ha[7].n128_u32[1];
+    pChain[2]->H[0] = ha[7].n128_u32[2];
+    pChain[3]->H[0] = ha[7].n128_u32[3];
+
+    pChain[0]->H[1] = ha[6].n128_u32[0];
+    pChain[1]->H[1] = ha[6].n128_u32[1];
+    pChain[2]->H[1] = ha[6].n128_u32[2];
+    pChain[3]->H[1] = ha[6].n128_u32[3];
+
+    pChain[0]->H[2] = ha[5].n128_u32[0];
+    pChain[1]->H[2] = ha[5].n128_u32[1];
+    pChain[2]->H[2] = ha[5].n128_u32[2];
+    pChain[3]->H[2] = ha[5].n128_u32[3];
+
+    pChain[0]->H[3] = ha[4].n128_u32[0];
+    pChain[1]->H[3] = ha[4].n128_u32[1];
+    pChain[2]->H[3] = ha[4].n128_u32[2];
+    pChain[3]->H[3] = ha[4].n128_u32[3];
+
+    pChain[0]->H[4] = ha[3].n128_u32[0];
+    pChain[1]->H[4] = ha[3].n128_u32[1];
+    pChain[2]->H[4] = ha[3].n128_u32[2];
+    pChain[3]->H[4] = ha[3].n128_u32[3];
+
+    pChain[0]->H[5] = ha[2].n128_u32[0];
+    pChain[1]->H[5] = ha[2].n128_u32[1];
+    pChain[2]->H[5] = ha[2].n128_u32[2];
+    pChain[3]->H[5] = ha[2].n128_u32[3];
+
+    pChain[0]->H[6] = ha[1].n128_u32[0];
+    pChain[1]->H[6] = ha[1].n128_u32[1];
+    pChain[2]->H[6] = ha[1].n128_u32[2];
+    pChain[3]->H[6] = ha[1].n128_u32[3];
+
+    pChain[0]->H[7] = ha[0].n128_u32[0];
+    pChain[1]->H[7] = ha[0].n128_u32[1];
+    pChain[2]->H[7] = ha[0].n128_u32[2];
+    pChain[3]->H[7] = ha[0].n128_u32[3];
+
+    SymCryptWipeKnownSize( buf, PAR_SCRATCH_ELEMENTS * sizeof( __n128 ) );  // wipe the whole scratch area
+}
+
+#undef CH
+#undef MAJ
+#undef CSIGMA0
+#undef CSIGMA1
+#undef LSIGMA0
+#undef LSIGMA1
+
+#endif // SYMCRYPT_CPU_ARM
+
+
+
+#if SYMCRYPT_CPU_X86 || SYMCRYPT_CPU_AMD64 || SYMCRYPT_CPU_ARM
+
+VOID
+SYMCRYPT_CALL
+SymCryptParallelSha256AppendBytes_serial(
+    _Inout_updates_( nPar ) PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE * pWork,
+    _In_range_(1, MAX_PARALLEL) SIZE_T nPar,
+    SIZE_T nBytes )
+{
+    SIZE_T i;
+    SIZE_T tmp;
+
+    SYMCRYPT_ASSERT( nBytes % SYMCRYPT_SHA256_INPUT_BLOCK_SIZE
== 0 ); + SYMCRYPT_ASSERT( nPar >= 1 && nPar <= MAX_PARALLEL ); + + for( i=0; i < nPar; i++ ) + { + SYMCRYPT_ASSERT( pWork[i]->cbData >= nBytes ); + SymCryptSha256AppendBlocks( & ((PSYMCRYPT_SHA256_STATE)(pWork[i]->hashState))->chain, pWork[i]->pbData, nBytes, &tmp ); + pWork[i]->pbData += nBytes; + pWork[i]->cbData -= nBytes; + } + return; +} + +VOID +SYMCRYPT_CALL +SymCryptParallelSha256Append( + _Inout_updates_( nPar ) PSYMCRYPT_PARALLEL_HASH_SCRATCH_STATE * pWork, + _In_range_(1, MAX_PARALLEL) SIZE_T nPar, + SIZE_T nBytes, + _Out_writes_to_( SYMCRYPT_SIMD_ELEMENT_SIZE * PAR_SCRATCH_ELEMENTS, 0 ) + PBYTE pbSimdScratch, + SIZE_T cbSimdScratch ) +{ + PSYMCRYPT_SHA256_CHAINING_STATE apChain[MAX_PARALLEL]; + PCBYTE apData[MAX_PARALLEL]; + SIZE_T i; + UINT32 maxParallel; + + UNREFERENCED_PARAMETER( cbSimdScratch ); // not referenced on FRE builds + SYMCRYPT_ASSERT( cbSimdScratch >= PAR_SCRATCH_ELEMENTS * SYMCRYPT_SIMD_ELEMENT_SIZE ); + SYMCRYPT_ASSERT( ((UINT_PTR)pbSimdScratch & (SYMCRYPT_SIMD_ELEMENT_SIZE - 1)) == 0 ); + + // + // Compute maxParallel; this is 4 if nPar <= 4, and 8 if nPar = 5, ..., 8. + // This is how many parameter sets we have to set up. + // +#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 + + maxParallel = (nPar + 3) & ~3; + SYMCRYPT_ASSERT( maxParallel == 4 || (maxParallel == 8 && SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_AVX2 )) ); + +#elif SYMCRYPT_CPU_ARM + + maxParallel = 4; + +#endif + + SYMCRYPT_ASSERT( nPar >= 1 && nPar <= maxParallel ); + + if( nPar < MIN_PARALLEL ) + { + SymCryptParallelSha256AppendBytes_serial( pWork, nPar, nBytes ); + + // Done with this function. + goto cleanup; + } + + // + // Our parallel code expects exactly four or eight parallel computations. + // We simply duplicate the first one if we get fewer parallel ones. + // That means we write the result multiple times, but it saves a lot of + // extra if()s in the main codeline. + // + + i = 0; + while( i < nPar ) + { + SYMCRYPT_ASSERT( pWork[i]->cbData >= nBytes ); + apChain[i] = & ((PSYMCRYPT_SHA256_STATE)(pWork[i]->hashState))->chain; + apData[i] = pWork[i]->pbData; + pWork[i]->pbData += nBytes; + pWork[i]->cbData -= nBytes; + i++; + } + + while( i < maxParallel ) + { + apChain[i] = apChain[0]; + apData[i] = apData[0]; + i++; + } + +#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 + if( maxParallel == 8 ) + { + SymCryptParallelSha256AppendBlocks_ymm( &apChain[0], &apData[0], nBytes, (__m256i *)pbSimdScratch ); + } else { + SymCryptParallelSha256AppendBlocks_xmm( &apChain[0], &apData[0], nBytes, (__m128i *)pbSimdScratch ); + } +#elif SYMCRYPT_CPU_ARM + SymCryptParallelSha256AppendBlocks_neon( &apChain[0], &apData[0], nBytes, (__n128 *) pbSimdScratch ); +#else +#error Unknown CPU +#endif + +cleanup: + ;// no cleanup at this moment. 
+}
+
+#endif
+
+/*
+VOID
+SYMCRYPT_CALL
+SymCryptParallelSha256AppendBlocks(
+    _Inout_updates_( nWork ) PSYMCRYPT_PARALLEL_SHA256_STATE * pWork,
+    SIZE_T nWork,
+    SIZE_T nBytes )
+{
+    SIZE_T i;
+
+    SYMCRYPT_ASSERT( nWork >= 1 && nWork <= 4 );
+
+    for( i=0; i < nWork; i++ )
+    {
+        SYMCRYPT_ASSERT( pWork[i]->cbData >= nBytes );
+        SymCryptSha256AppendBlocks( &pWork[i]->internalState.hashState.chain, pWork[i]->pbData, nBytes );
+        pWork[i]->pbData += nBytes;
+        pWork[i]->cbData -= nBytes;
+    }
+}
+
+*/
+
+#endif // SUPPORT_PARALLEL
+
+#if SUPPORT_PARALLEL
+
+const SYMCRYPT_PARALLEL_HASH SymCryptParallelSha256Algorithm_default = {
+    &SymCryptSha256Algorithm_default,
+    PAR_SCRATCH_ELEMENTS * SYMCRYPT_SIMD_ELEMENT_SIZE,
+    &SymCryptParallelSha256Result1,
+    &SymCryptParallelSha256Result2,
+    &SymCryptParallelSha256ResultDone,
+    &SymCryptParallelSha256Append,
+};
+
+#else
+
+//
+// For platforms that do not have a parallel hash implementation
+// we use this structure to provide the necessary data to the _serial
+// implementation of the function.
+//
+const SYMCRYPT_PARALLEL_HASH SymCryptParallelSha256Algorithm_default = {
+    &SymCryptSha256Algorithm_default,
+    PAR_SCRATCH_ELEMENTS * SYMCRYPT_SIMD_ELEMENT_SIZE,
+    NULL,
+    NULL,
+    NULL,
+    NULL,
+};
+
+#endif
+
+const PCSYMCRYPT_PARALLEL_HASH SymCryptParallelSha256Algorithm = &SymCryptParallelSha256Algorithm_default;
+
+
+#define N_SELFTEST_STATES 5 // Just enough to trigger YMM usage
+
+VOID
+SYMCRYPT_CALL
+SymCryptParallelSha256Selftest()
+{
+    SYMCRYPT_SHA256_STATE states[N_SELFTEST_STATES];
+    BYTE result[N_SELFTEST_STATES][SYMCRYPT_SHA256_RESULT_SIZE];
+    SYMCRYPT_PARALLEL_HASH_OPERATION op[2*N_SELFTEST_STATES];
+    BYTE scratch[SYMCRYPT_PARALLEL_SHA256_FIXED_SCRATCH + N_SELFTEST_STATES * SYMCRYPT_PARALLEL_HASH_PER_STATE_SCRATCH];
+    int i;
+
+    SymCryptParallelSha256Init( &states[0], N_SELFTEST_STATES );
+
+    for( i=0; i