diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl index 86ee3825f2f28..98b0a22cc1f3f 100755 --- a/crypto/ec/asm/ecp_nistz256-x86_64.pl +++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl @@ -2,7 +2,14 @@ ############################################################################## # # -# Copyright 2014 Intel Corporation # +# Copyright (c) 2015 Intel Corporation # +# Copyright (c) 2015 CloudFlare, Inc. # +# All rights reserved. # +# # +# This software is made available to you under your choice of the # +# Apache V.2.0 and/or BSD license below: # +# # +############################################################################## # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # @@ -18,10 +25,41 @@ # # ############################################################################## # # +# Redistribution and use in source and binary forms, with or without # +# modification, are permitted provided that the following conditions are # +# met: # +# # +# # Redistributions of source code must retain the above copyright # +# notice, this list of conditions and the following disclaimer. # +# # +# # Redistributions in binary form must reproduce the above copyright # +# notice, this list of conditions and the following disclaimer in the # +# documentation and/or other materials provided with the # +# distribution. # +# # +# # Neither the name of the copyright holders nor the names of its # +# contributors may be used to endorse or promote products derived from # +# this software without specific prior written permission. # +# # +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED # +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR# +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR # +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # +# # +############################################################################## +# # # Developers and authors: # -# Shay Gueron (1, 2), and Vlad Krasnov (1) # +# Shay Gueron (1, 2), and Vlad Krasnov (1, 3) # # (1) Intel Corporation, Israel Development Center # # (2) University of Haifa # +# (3) CloudFlare, Inc. 
# # Reference: # # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with# # 256 Bit Primes" # @@ -108,6 +146,13 @@ .long 3,3,3,3,3,3,3,3 .LONE_mont: .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + +# Constants for computations modulo ord(p256) +.align 64 +.Lord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f ___ { @@ -433,6 +478,981 @@ my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); my ($poly1,$poly3)=($acc6,$acc7); +$code.=<<___; +################################################################################ +# void ecp_nistz256_ord_mul_mont( +# uint64_t res[4], +# uint64_t a[4], +# uint64_t b[4]); + +.globl ecp_nistz256_ord_mul_mont +.type ecp_nistz256_ord_mul_mont,\@function,3 +.align 32 +ecp_nistz256_ord_mul_mont: +___ +$code.=<<___ if ($addx); + mov \$0x80100, %ecx + and OPENSSL_ia32cap_P+8(%rip), %ecx + cmp \$0x80100, %ecx + je ecp_nistz256_ord_mul_montx +___ +$code.=<<___; + push %rbp + push %rbx + push %r12 + push %r13 + + mov $b_org, $b_ptr + # * b[0] + mov 8*0($b_ptr), $t0 + mov 8*0($a_ptr), $t4 + mul $t0 + mov $t4, $acc0 + mov $t3, $acc1 + + mov 8*1($a_ptr), $t4 + mul $t0 + add $t4, $acc1 + adc \$0, $t3 + mov $t3, $acc2 + + mov 8*2($a_ptr), $t4 + mul $t0 + add $t4, $acc2 + adc \$0, $t3 + mov $t3, $acc3 + + mov 8*3($a_ptr), $t4 + mul $t0 + add $t4, $acc3 + adc \$0, $t3 + mov $t3, $acc4 + xor $acc5, $acc5 + + # First reduction step + mov $acc0, $t4 + mulq .LordK(%rip) + mov $t4, $t0 + + mov 8*0+.Lord(%rip), $t4 + mul $t0 + add $t4, $acc0 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*1+.Lord(%rip), $t4 + mul $t0 + add $t1, $acc1 + adc \$0, $t3 + add $t4, $acc1 + + mov $t0, $t1 + adc $t3, $acc2 + adc \$0, $t1 + sub $t0, $acc2 + sbb \$0, $t1 + + mov 8*3+.Lord(%rip), $t4 + mul $t0 + add $t1, $acc3 + adc \$0, $t3 + add $t4, $acc3 + adc $t3, $acc4 + adc \$0, $acc5 + + # * b[1] + mov 8*1($b_ptr), $t0 + + mov 8*0($a_ptr), $t4 + mul $t0 + add $t4, $acc1 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*1($a_ptr), $t4 + mul $t0 + add $t1, $acc2 + adc \$0, $t3 + add $t4, $acc2 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*2($a_ptr), $t4 + mul $t0 + add $t1, $acc3 + adc \$0, $t3 + add $t4, $acc3 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*3($a_ptr), $t4 + mul $t0 + add $t1, $acc4 + adc \$0, $t3 + add $t4, $acc4 + adc $t3, $acc5 + adc \$0, $acc0 + # Second reduction step + mov $acc1, $t4 + mulq .LordK(%rip) + mov $t4, $t0 + + mov 8*0+.Lord(%rip), $t4 + mul $t0 + add $t4, $acc1 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*1+.Lord(%rip), $t4 + mul $t0 + add $t1, $acc2 + adc \$0, $t3 + add $t4, $acc2 + + mov $t0, $t1 + adc $t3, $acc3 + adc \$0, $t1 + sub $t0, $acc3 + sbb \$0, $t1 + + mov 8*3+.Lord(%rip), $t4 + mul $t0 + add $t1, $acc4 + adc \$0, $t3 + add $t4, $acc4 + adc $t3, $acc5 + adc \$0, $acc0 + # * b[2] + mov 8*2($b_ptr), $t0 + + mov 8*0($a_ptr), $t4 + mul $t0 + add $t4, $acc2 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*1($a_ptr), $t4 + mul $t0 + add $t1, $acc3 + adc \$0, $t3 + add $t4, $acc3 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*2($a_ptr), $t4 + mul $t0 + add $t1, $acc4 + adc \$0, $t3 + add $t4, $acc4 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*3($a_ptr), $t4 + mul $t0 + add $t1, $acc5 + adc \$0, $t3 + add $t4, $acc5 + adc $t3, $acc0 + adc \$0, $acc1 + # Third reduction step + mov $acc2, $t4 + mulq .LordK(%rip) + mov $t4, $t0 + + mov 8*0+.Lord(%rip), $t4 + mul $t0 + add $t4, $acc2 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*1+.Lord(%rip), $t4 + mul $t0 + add $t1, $acc3 + adc \$0, 
$t3 + add $t4, $acc3 + + mov $t0, $t1 + adc $t3, $acc4 + adc \$0, $t1 + sub $t0, $acc4 + sbb \$0, $t1 + + mov 8*3+.Lord(%rip), $t4 + mul $t0 + add $t1, $acc5 + adc \$0, $t3 + add $t4, $acc5 + adc $t3, $acc0 + adc \$0, $acc1 + # * b[3] + mov 8*3($b_ptr), $t0 + + mov 8*0($a_ptr), $t4 + mul $t0 + add $t4, $acc3 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*1($a_ptr), $t4 + mul $t0 + add $t1, $acc4 + adc \$0, $t3 + add $t4, $acc4 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*2($a_ptr), $t4 + mul $t0 + add $t1, $acc5 + adc \$0, $t3 + add $t4, $acc5 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*3($a_ptr), $t4 + mul $t0 + add $t1, $acc0 + adc \$0, $t3 + add $t4, $acc0 + adc $t3, $acc1 + adc \$0, $acc2 + # Last reduction step + mov $acc3, $t4 + mulq .LordK(%rip) + mov $t4, $t0 + + mov 8*0+.Lord(%rip), $t4 + mul $t0 + add $t4, $acc3 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*1+.Lord(%rip), $t4 + mul $t0 + add $t1, $acc4 + adc \$0, $t3 + add $t4, $acc4 + + mov $t0, $t1 + adc $t3, $acc5 + adc \$0, $t1 + sub $t0, $acc5 + sbb \$0, $t1 + + mov 8*3+.Lord(%rip), $t4 + mul $t0 + add $t1, $acc0 + adc \$0, $t3 + add $t4, $acc0 + adc $t3, $acc1 + adc \$0, $acc2 + + # Copy result [255:0] + mov $acc4, $a_ptr + mov $acc5, $acc3 + mov $acc0, $t0 + mov $acc1, $t1 + # Subtract ord + sub 8*0+.Lord(%rip), $acc4 + sbb 8*1+.Lord(%rip), $acc5 + sbb 8*2+.Lord(%rip), $acc0 + sbb 8*3+.Lord(%rip), $acc1 + sbb \$0, $acc2 + + cmovc $a_ptr, $acc4 + cmovc $acc3, $acc5 + cmovc $t0, $acc0 + cmovc $t1, $acc1 + + mov $acc4, 8*0($r_ptr) + mov $acc5, 8*1($r_ptr) + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + pop %r13 + pop %r12 + pop %rbx + pop %rbp + ret +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont +___ +$code.=<<___ if ($addx); +################################################################################ +.align 32 +ecp_nistz256_ord_mul_montx: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + mov $b_org, $b_ptr + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), $acc1 + mov 8*1($a_ptr), $acc2 + mov 8*2($a_ptr), $acc3 + mov 8*3($a_ptr), $acc4 + lea -128($a_ptr), $a_ptr # control u-op density + + # Multiply by b[0] + mulx $acc1, $acc0, $acc1 + mulx $acc2, $t0, $acc2 + xor $acc5, $acc5 # cf=0 + mulx $acc3, $t1, $acc3 + adc $t0, $acc1 + mulx $acc4, $t0, $acc4 + mov $acc0, %rdx + mulx .LordK(%rip), %rdx, $t4 + adc $t1, $acc2 + adc $t0, $acc3 + adc \$0, $acc4 + + ######################################################################## + xor %eax, %eax + mulx 8*0+.Lord(%rip), $t0, $t1 + adcx $t0, $acc0 + adox $t1, $acc1 + mulx 8*1+.Lord(%rip), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + mulx 8*2+.Lord(%rip), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + mulx 8*3+.Lord(%rip), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + mov 8*1($b_ptr), %rdx + adcx %rax, $acc4 + adox %rax, $acc5 + adc \$0, $acc5 + xor $acc0 ,$acc0 + ######################################################################## + # Multiply by b[1] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc1, %rdx + mulx .LordK(%rip), %rdx, $t4 + adcx $t0, $acc4 + adox $t1, $acc5 + + adcx $acc0, $acc5 + adox $acc0, $acc0 + adc \$0, $acc0 + ######################################################################## + xor %eax, %eax + mulx 8*0+.Lord(%rip), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + mulx 8*1+.Lord(%rip), $t0, $t1 + adcx $t0, $acc2 + adox $t1, 
$acc3 + mulx 8*2+.Lord(%rip), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + mulx 8*3+.Lord(%rip), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + mov 8*2($b_ptr), %rdx + adcx %rax, $acc5 + adox %rax, $acc0 + adc \$0, $acc0 + xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 + ######################################################################## + # Multiply by b[2] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc2, %rdx + mulx .LordK(%rip), %rdx, $t4 + adcx $t0, $acc5 + adox $t1, $acc0 + + adcx $acc1, $acc0 + adox $acc1, $acc1 + adc \$0, $acc1 + + ######################################################################## + xor %eax, %eax + mulx 8*0+.Lord(%rip), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + mulx 8*1+.Lord(%rip), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + mulx 8*2+.Lord(%rip), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + mulx 8*3+.Lord(%rip), $t0, $t1 + adcx $t0, $acc5 + adox $t1, $acc0 + mov 8*3($b_ptr), %rdx + adcx %rax, $acc0 + adox %rax, $acc1 + adc \$0, $acc1 + xor $acc2 ,$acc2 # $acc2=0,cf=0,of=0 + ######################################################################## + # Multiply by b[3] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc5 + adox $t1, $acc0 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc3, %rdx + mulx .LordK(%rip), %rdx, $t4 + adcx $t0, $acc0 + adox $t1, $acc1 + + adcx $acc2, $acc1 + adox $acc2, $acc2 + adc \$0, $acc2 + + ######################################################################## + xor %eax, %eax + mulx 8*0+.Lord(%rip), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + mulx 8*1+.Lord(%rip), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + mulx 8*2+.Lord(%rip), $t0, $t1 + adcx $t0, $acc5 + adox $t1, $acc0 + mulx 8*3+.Lord(%rip), $t0, $t1 + adcx $t0, $acc0 + adox $t1, $acc1 + adcx %rax, $acc1 + adox %rax, $acc2 + adc \$0, $acc2 + + ######################################################################## + # Branch-less conditional subtraction of P + xor %eax, %eax + mov $acc4, $t2 + mov $acc5, $t3 + mov $acc0, $t0 + mov $acc1, $t1 + sbb 8*0+.Lord(%rip), $acc4 # .Lpoly[0] + sbb 8*1+.Lord(%rip), $acc5 # .Lpoly[1] + sbb 8*2+.Lord(%rip), $acc0 # .Lpoly[1] + sbb 8*3+.Lord(%rip), $acc1 # .Lpoly[1] + sbb \$0, $acc2 + + cmovc $t2, $acc4 + cmovc $t3, $acc5 + mov $acc4, 8*0($r_ptr) + cmovc $t0, $acc0 + mov $acc5, 8*1($r_ptr) + cmovc $t1, $acc1 + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + ret +.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx +################################################################################ +___ +$code.=<<___; +# void ecp_nistz256_ord_sqr_mont( +# uint64_t res[4], +# uint64_t a[4], +# int rep); + +.globl ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,\@function,3 +.align 32 +ecp_nistz256_ord_sqr_mont: + +___ +$code.=<<___ if ($addx); + mov \$0x80100, %ecx + and OPENSSL_ia32cap_P+8(%rip), %ecx + cmp \$0x80100, %ecx + je ecp_nistz256_ord_sqr_montx +___ +$code.=<<___; + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + mov .LordK(%rip), %r15 + + mov $b_org, %r14 + +.Lord_sqr_loop: + # y[1:] * y[0] + mov 8*0($a_ptr), $t0 + + mov 8*1($a_ptr), $t4 + mul $t0 + mov $t4, 
$acc1 + mov $t3, $acc2 + + mov 8*2($a_ptr), $t4 + mul $t0 + add $t4, $acc2 + adc \$0, $t3 + mov $t3, $acc3 + + mov 8*3($a_ptr), $t4 + mul $t0 + add $t4, $acc3 + adc \$0, $t3 + mov $t3, $acc4 + # y[2:] * y[1] + mov 8*1($a_ptr), $t0 + + mov 8*2($a_ptr), $t4 + mul $t0 + add $t4, $acc3 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*3($a_ptr), $t4 + mul $t0 + add $t1, $acc4 + adc \$0, $t3 + add $t4, $acc4 + adc \$0, $t3 + mov $t3, $acc5 + # y[3] * y[2] + mov 8*2($a_ptr), $t0 + + mov 8*3($a_ptr), $t4 + mul $t0 + add $t4, $acc5 + adc \$0, $t3 + mov $t3, $b_ptr + xor $t1, $t1 + # *2 + add $acc1, $acc1 + adc $acc2, $acc2 + adc $acc3, $acc3 + adc $acc4, $acc4 + adc $acc5, $acc5 + adc $b_ptr, $b_ptr + adc \$0, $t1 + # Missing products + mov 8*0($a_ptr), $t4 + mul $t4 + mov $t4, $acc0 + mov $t3, $t0 + + mov 8*1($a_ptr), $t4 + mul $t4 + add $t0, $acc1 + adc $t4, $acc2 + adc \$0, $t3 + mov $t3, $t0 + + mov 8*2($a_ptr), $t4 + mul $t4 + add $t0, $acc3 + adc $t4, $acc4 + adc \$0, $t3 + mov $t3, $t0 + + mov 8*3($a_ptr), $t4 + mul $t4 + add $t0, $acc5 + adc $t4, $b_ptr + adc $t3, $t1 + mov $t1, $a_ptr + + # First reduction step + mov $acc0, $t4 + mulq %r15 + mov $t4, $t0 + + mov 8*0+.Lord(%rip), $t4 + mul $t0 + add $t4, $acc0 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*1+.Lord(%rip), $t4 + mul $t0 + add $t1, $acc1 + adc \$0, $t3 + add $t4, $acc1 + + mov $t0, $t1 + adc $t3, $acc2 + adc \$0, $t1 + sub $t0, $acc2 + sbb \$0, $t1 + + mov $t0, $t4 + mov $t0, $t3 + mov $t0, $acc0 + shl \$32, $t4 + shr \$32, $t3 + + add $t1, $acc3 + adc \$0, $acc0 + sub $t4, $acc3 + sbb $t3, $acc0 + + # Second reduction step + mov $acc1, $t4 + mulq %r15 + mov $t4, $t0 + + mov 8*0+.Lord(%rip), $t4 + mul $t0 + add $t4, $acc1 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*1+.Lord(%rip), $t4 + mul $t0 + add $t1, $acc2 + adc \$0, $t3 + add $t4, $acc2 + + mov $t0, $t1 + adc $t3, $acc3 + adc \$0, $t1 + sub $t0, $acc3 + sbb \$0, $t1 + + mov $t0, $t4 + mov $t0, $t3 + mov $t0, $acc1 + shl \$32, $t4 + shr \$32, $t3 + + add $t1, $acc0 + adc \$0, $acc1 + sub $t4, $acc0 + sbb $t3, $acc1 + + # Third reduction step + mov $acc2, $t4 + mulq %r15 + mov $t4, $t0 + + mov 8*0+.Lord(%rip), $t4 + mul $t0 + add $t4, $acc2 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*1+.Lord(%rip), $t4 + mul $t0 + add $t1, $acc3 + adc \$0, $t3 + add $t4, $acc3 + + mov $t0, $t1 + adc $t3, $acc0 + adc \$0, $t1 + sub $t0, $acc0 + sbb \$0, $t1 + + mov $t0, $t4 + mov $t0, $t3 + mov $t0, $acc2 + shl \$32, $t4 + shr \$32, $t3 + + add $t1, $acc1 + adc \$0, $acc2 + sub $t4, $acc1 + sbb $t3, $acc2 + + # Last reduction step + mov $acc3, $t4 + mulq %r15 + mov $t4, $t0 + + mov 8*0+.Lord(%rip), $t4 + mul $t0 + add $t4, $acc3 + adc \$0, $t3 + mov $t3, $t1 + + mov 8*1+.Lord(%rip), $t4 + mul $t0 + add $t1, $acc0 + adc \$0, $t3 + add $t4, $acc0 + + mov $t0, $t1 + adc $t3, $acc1 + adc \$0, $t1 + sub $t0, $acc1 + sbb \$0, $t1 + + mov $t0, $t4 + mov $t0, $acc3 + shl \$32, $t4 + shr \$32, $t0 + + add $t1, $acc2 + adc \$0, $acc3 + sub $t4, $acc2 + sbb $t0, $acc3 + xor $t0, $t0 + # Add bits [511:256] of the sqr result + add $acc4, $acc0 + adc $acc5, $acc1 + adc $b_ptr, $acc2 + adc $a_ptr, $acc3 + adc \$0, $t0 + + mov $acc0, $acc4 + mov $acc1, $acc5 + mov $acc2, $b_ptr + mov $acc3, $t1 + # Subtract p256 + sub 8*0+.Lord(%rip), $acc0 + sbb 8*1+.Lord(%rip), $acc1 + sbb 8*2+.Lord(%rip), $acc2 + sbb 8*3+.Lord(%rip), $acc3 + sbb \$0, $t0 + + cmovc $acc4, $acc0 + cmovc $acc5, $acc1 + cmovc $b_ptr, $acc2 + cmovc $t1, $acc3 + + mov $acc0, 8*0($r_ptr) + mov $acc1, 8*1($r_ptr) + mov $acc2, 8*2($r_ptr) + mov $acc3, 8*3($r_ptr) + mov 
$r_ptr, $a_ptr + dec %r14 + jne .Lord_sqr_loop + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + ret +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +___ +$code.=<<___ if ($addx); +.align 32 +ecp_nistz256_ord_sqr_montx: + + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + mov $b_org, $t2 + lea -128($a_ptr), $a_ptr # control u-op density +.Lord_sqrx_loop: + mov 8*0+128($a_ptr), %rdx + mov 8*1+128($a_ptr), $acc6 + mov 8*2+128($a_ptr), $acc7 + mov 8*3+128($a_ptr), $acc0 + + mulx $acc6, $acc1, $acc2 # a[0]*a[1] + mulx $acc7, $t0, $acc3 # a[0]*a[2] + xor %eax, %eax + adc $t0, $acc2 + mulx $acc0, $t1, $acc4 # a[0]*a[3] + mov $acc6, %rdx + adc $t1, $acc3 + adc \$0, $acc4 + xor $acc5, $acc5 # $acc5=0,cf=0,of=0 + ################################# + mulx $acc7, $t0, $t1 # a[1]*a[2] + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx $acc0, $t0, $t1 # a[1]*a[3] + mov $acc7, %rdx + adcx $t0, $acc4 + adox $t1, $acc5 + adc \$0, $acc5 + ################################# + mulx $acc0, $t0, $acc6 # a[2]*a[3] + mov 8*0+128($a_ptr), %rdx + xor $acc7, $acc7 # $acc7=0,cf=0,of=0 + adcx $acc1, $acc1 # acc1:6<<1 + adox $t0, $acc5 + adcx $acc2, $acc2 + adox $acc7, $acc6 # of=0 + + mulx %rdx, $acc0, $t1 + mov 8*1+128($a_ptr), %rdx + adcx $acc3, $acc3 + adox $t1, $acc1 + adcx $acc4, $acc4 + mulx %rdx, $t0, $t4 + mov 8*2+128($a_ptr), %rdx + adcx $acc5, $acc5 + adox $t0, $acc2 + adcx $acc6, $acc6 + .byte 0x67 + mulx %rdx, $t0, $t1 + mov 8*3+128($a_ptr), %rdx + adox $t4, $acc3 + adcx $acc7, $acc7 + adox $t0, $acc4 + adox $t1, $acc5 + .byte 0x67,0x67 + mulx %rdx, $t0, $t4 + adox $t0, $acc6 + adox $t4, $acc7 + + #reduce + mov $acc0, %rdx + mulx .LordK(%rip), %rdx, $t0 + + xor %eax, %eax + mulx 8*0+.Lord(%rip), $t0, $t1 + adcx $t0, $acc0 + adox $t1, $acc1 + mulx 8*1+.Lord(%rip), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + mulx 8*2+.Lord(%rip), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + mulx 8*3+.Lord(%rip), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc0 + adcx %rax, $acc0 + ################################# + mov $acc1, %rdx + mulx .LordK(%rip), %rdx, $t0 + + mulx 8*0+.Lord(%rip), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + mulx 8*1+.Lord(%rip), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + mulx 8*2+.Lord(%rip), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc0 + mulx 8*3+.Lord(%rip), $t0, $t1 + adcx $t0, $acc0 + adox $t1, $acc1 + adcx %rax, $acc1 + ################################# + mov $acc2, %rdx + mulx .LordK(%rip), %rdx, $t0 + + mulx 8*0+.Lord(%rip), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + mulx 8*1+.Lord(%rip), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc0 + mulx 8*2+.Lord(%rip), $t0, $t1 + adcx $t0, $acc0 + adox $t1, $acc1 + mulx 8*3+.Lord(%rip), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + adcx %rax, $acc2 + ################################# + mov $acc3, %rdx + mulx .LordK(%rip), %rdx, $t0 + + mulx 8*0+.Lord(%rip), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc0 + mulx 8*1+.Lord(%rip), $t0, $t1 + adcx $t0, $acc0 + adox $t1, $acc1 + mulx 8*2+.Lord(%rip), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + mulx 8*3+.Lord(%rip), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + adcx %rax, $acc3 + + xor $t0, $t0 + add $acc4, $acc0 + adc $acc5, $acc1 + adc $acc6, $acc2 + adc $acc7, $acc3 + adc \$0, $t0 + + mov $acc0, $acc4 + mov $acc1, $acc5 + mov $acc2, $acc6 + mov $acc3, $acc7 + # Subtract p256 + sub 8*0+.Lord(%rip), $acc0 + sbb 8*1+.Lord(%rip), $acc1 + sbb 8*2+.Lord(%rip), $acc2 + sbb 8*3+.Lord(%rip), $acc3 + sbb \$0, $t0 + + cmovc $acc4, $acc0 + cmovc $acc5, $acc1 
+ cmovc $acc6, $acc2 + cmovc $acc7, $acc3 + + mov $acc0, 8*0($r_ptr) + mov $acc1, 8*1($r_ptr) + mov $acc2, 8*2($r_ptr) + mov $acc3, 8*3($r_ptr) + + lea -128($r_ptr), $a_ptr + + dec $t2 + jne .Lord_sqrx_loop + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + ret + +.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx +___ $code.=<<___; ################################################################################ # void ecp_nistz256_to_mont( diff --git a/crypto/ec/ec_err.c b/crypto/ec/ec_err.c index 13b32c78ac744..e8286899179d9 100644 --- a/crypto/ec/ec_err.c +++ b/crypto/ec/ec_err.c @@ -190,6 +190,7 @@ static ERR_STRING_DATA EC_str_functs[] = { {ERR_FUNC(EC_F_EC_GROUP_SET_CURVE_GFP), "EC_GROUP_set_curve_GFp"}, {ERR_FUNC(EC_F_EC_GROUP_SET_EXTRA_DATA), "EC_GROUP_SET_EXTRA_DATA"}, {ERR_FUNC(EC_F_EC_GROUP_SET_GENERATOR), "EC_GROUP_set_generator"}, + {ERR_FUNC(EC_F_EC_GROUP_ORD_INVERSE), "EC_GROUP_do_inverse_ord"}, {ERR_FUNC(EC_F_EC_KEY_CHECK_KEY), "EC_KEY_check_key"}, {ERR_FUNC(EC_F_EC_KEY_COPY), "EC_KEY_copy"}, {ERR_FUNC(EC_F_EC_KEY_GENERATE_KEY), "EC_KEY_generate_key"}, @@ -245,6 +246,7 @@ static ERR_STRING_DATA EC_str_functs[] = { {ERR_FUNC(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE), "ecp_nistz256_mult_precompute"}, {ERR_FUNC(EC_F_ECP_NISTZ256_PRE_COMP_NEW), "ecp_nistz256_pre_comp_new"}, + {ERR_FUNC(EC_F_ECP_NISTZ256_INV_ORD), "ecp_nistz256_inv_mod_ord"}, {ERR_FUNC(EC_F_O2I_ECPUBLICKEY), "o2i_ECPublicKey"}, {ERR_FUNC(EC_F_OLD_EC_PRIV_DECODE), "OLD_EC_PRIV_DECODE"}, {ERR_FUNC(EC_F_PKEY_EC_CTRL), "PKEY_EC_CTRL"}, diff --git a/crypto/ec/ec_lcl.h b/crypto/ec/ec_lcl.h index 9db7106c5aaa7..b79ee35992460 100644 --- a/crypto/ec/ec_lcl.h +++ b/crypto/ec/ec_lcl.h @@ -196,6 +196,9 @@ struct ec_method_st { int (*field_decode) (const EC_GROUP *, BIGNUM *r, const BIGNUM *a, BN_CTX *); int (*field_set_to_one) (const EC_GROUP *, BIGNUM *r, BN_CTX *); + + /* Inverse modulo order */ + int (*field_inverse_mod_ord) (const EC_GROUP *, BIGNUM *r, const BIGNUM *x, BN_CTX *ctx); } /* EC_METHOD */ ; typedef struct ec_extra_data_st { diff --git a/crypto/ec/ec_lib.c b/crypto/ec/ec_lib.c index 9156943e200a0..80d29fdcf9953 100644 --- a/crypto/ec/ec_lib.c +++ b/crypto/ec/ec_lib.c @@ -332,6 +332,47 @@ int EC_GROUP_get_order(const EC_GROUP *group, BIGNUM *order, BN_CTX *ctx) return !BN_is_zero(order); } +int EC_GROUP_do_inverse_ord(const EC_GROUP *group, + BIGNUM *res, + const BIGNUM *x, + BN_CTX *ctx, + int constantTime) +{ + int ret = 0; + BN_CTX_start(ctx); + if (group->meth->field_inverse_mod_ord != NULL) { + ret = group->meth->field_inverse_mod_ord(group, res, x, ctx); + } else if (constantTime) { + BIGNUM *tmp; + if ((tmp = BN_CTX_get(ctx)) == NULL) { + ECDSAerr(EC_F_EC_GROUP_ORD_INVERSE, ERR_R_BN_LIB); + goto err; + } + if (!BN_set_word(tmp, 2)) { + ECDSAerr(EC_F_EC_GROUP_ORD_INVERSE, ERR_R_BN_LIB); + goto err; + } + if (!BN_mod_sub(tmp, group->order, tmp, group->order, ctx)) { + ECDSAerr(EC_F_EC_GROUP_ORD_INVERSE, ERR_R_BN_LIB); + goto err; + } + BN_set_flags(tmp, BN_FLG_CONSTTIME); + if (!BN_mod_exp_mont_consttime + (res, x, tmp, group->order, ctx, EC_GROUP_get_mont_data(group))) { + ECDSAerr(EC_F_EC_GROUP_ORD_INVERSE, ERR_R_BN_LIB); + goto err; + } + ret = 1; + } else { + if (BN_mod_inverse(res, x, group->order, ctx)) { + ret = 1; + } + } +err: + BN_CTX_end(ctx); + return ret; +} + int EC_GROUP_get_cofactor(const EC_GROUP *group, BIGNUM *cofactor, BN_CTX *ctx) { diff --git a/crypto/ec/ecp_nistz256.c b/crypto/ec/ecp_nistz256.c index 83f0c6fdfdce6..38001e05d0ad0 100644 --- 
a/crypto/ec/ecp_nistz256.c +++ b/crypto/ec/ecp_nistz256.c @@ -1,6 +1,13 @@ /****************************************************************************** * * - * Copyright 2014 Intel Corporation * + * Copyright (c) 2015 Intel Corporation * + * Copyright (c) 2015 CloudFlare, Inc. * + * All rights reserved. * + * * + * This software is made available to you under your choice of the * + * Apache V.2.0 and/or BSD license below: * + * * + ****************************************************************************** * * * Licensed under the Apache License, Version 2.0 (the "License"); * * you may not use this file except in compliance with the License. * @@ -16,10 +23,41 @@ * * ****************************************************************************** * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions are * + * met: * + * * + * 1. Redistributions of source code must retain the above copyright * + * notice, this list of conditions and the following disclaimer. * + * * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the * + * distribution. * + * * + * 3. Neither the name of the copyright holders nor the names of its * + * contributors may be used to endorse or promote products derived from * + * this software without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR* + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + * * + ****************************************************************************** + * * * Developers and authors: * - * Shay Gueron (1, 2), and Vlad Krasnov (1) * + * Shay Gueron (1, 2), and Vlad Krasnov (1, 3) * * (1) Intel Corporation, Israel Development Center * * (2) University of Haifa * + * (3) CloudFlare, Inc. 
* * Reference: * * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with * * 256 Bit Primes" * @@ -102,6 +140,13 @@ void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]); +/* Montgomery mul modulo Order(P): res = a*b*2^-256 mod Order(P) */ +void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + int rep); /* Montgomery sqr: res = a*a*2^-256 mod P */ void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); @@ -1357,9 +1402,109 @@ __owur static int ecp_nistz256_points_mul(const EC_GROUP *group, return ret; } +#if defined(__x86_64) || defined(__x86_64__) +__owur int ecp_nistz256_inv_mod_ord(const EC_GROUP *group, + BIGNUM *r, + const BIGNUM *x, + BN_CTX *ctx) +{ + /* RR = 2^512 mod ord(p256) */ + static const BN_ULONG RR[P256_LIMBS] = {TOBN(0x83244c95,0xbe79eea2), + TOBN(0x4699799c,0x49bd6fa6), + TOBN(0x2845b239,0x2b6bec59), + TOBN(0x66e12d94,0xf3d95620)}; + /* The constant 1 (unlike ONE that is one in Montgomery representation) */ + static const BN_ULONG CONST_ONE[P256_LIMBS] = {TOBN(0,1), + TOBN(0,0), + TOBN(0,0), + TOBN(0,0)}; + /* expLo - the low 128bit of the exponent we use (ord(p256) - 2), + split into 4bit windows */ + static const unsigned char expLo[32] = {0xb,0xc,0xe,0x6, + 0xf,0xa,0xa,0xd, + 0xa,0x7,0x1,0x7, + 0x9,0xe,0x8,0x4, + 0xf,0x3,0xb,0x9, + 0xc,0xa,0xc,0x2, + 0xf,0xc,0x6,0x3, + 0x2,0x5,0x4,0xf}; + + BN_ULONG table[P256_LIMBS*15]; + BN_ULONG out[P256_LIMBS], t[P256_LIMBS]; + int i, ret = 0; + BIGNUM *tmp; + + if ((BN_num_bits(x) > 256) + || BN_is_negative(x)) { + if ((tmp = BN_CTX_get(ctx)) == NULL) { + ECerr(EC_F_ECP_NISTZ256_INV_ORD, ERR_R_BN_LIB); + goto err; + } + if (!BN_nnmod(tmp, x, group->order, ctx)) { + ECerr(EC_F_ECP_NISTZ256_INV_ORD, ERR_R_BN_LIB); + goto err; + } + x = tmp; + } + /* We don't use entry 0 in the table, so we address with -1 offset */ + ecp_nistz256_bignum_to_field_elem(out, x); + ecp_nistz256_ord_mul_mont(&table[0*P256_LIMBS], out, RR); + for ( i = 2; i < 16; i+=2 ) { + ecp_nistz256_ord_sqr_mont(&table[(i-1)*P256_LIMBS], + &table[(i/2-1)*P256_LIMBS], 1); + ecp_nistz256_ord_mul_mont(&table[i*P256_LIMBS], + &table[(i-1)*P256_LIMBS], + &table[0*P256_LIMBS]); + } + /* The top 128bit of the exponent are highly redundant, + so we perform an optimized flow */ + /* f */ + memcpy(out, &table[(15-1)*P256_LIMBS], sizeof(out)); + /* f0 */ + ecp_nistz256_ord_sqr_mont(out, out, 4); + /* ff */ + ecp_nistz256_ord_mul_mont(out, out, &table[(15-1)*P256_LIMBS]); + memcpy(t, out, sizeof(t)); + /* ff00 */ + ecp_nistz256_ord_sqr_mont(out, out, 8); + /* ffff */ + ecp_nistz256_ord_mul_mont(out, out, t); + memcpy(t, out, sizeof(t)); + /* ffff0000 */ + ecp_nistz256_ord_sqr_mont(out, out, 16); + /* ffffffff */ + ecp_nistz256_ord_mul_mont(out, out, t); + memcpy(t, out, sizeof(t)); + /* ffffffff0000000000000000 */ + ecp_nistz256_ord_sqr_mont(out, out, 64); + /* ffffffff00000000ffffffff */ + ecp_nistz256_ord_mul_mont(out, out, t); + /* ffffffff00000000ffffffff00000000 */ + ecp_nistz256_ord_sqr_mont(out, out, 32); + /* ffffffff00000000ffffffffffffffff */ + ecp_nistz256_ord_mul_mont(out, out, t); + + /* The bottom 128 bit of the exponent are easier done with a table */ + for( i = 0; i < 32; i++ ) { + ecp_nistz256_ord_sqr_mont(out, out, 4); + 
/* The exponent is public, no need in constant time access */ + ecp_nistz256_ord_mul_mont(out, out, &table[(expLo[i]-1)*P256_LIMBS]); + } + ecp_nistz256_ord_mul_mont(out, out, CONST_ONE); + + if (!bn_set_words(r, out, P256_LIMBS)) { + ECerr(EC_F_ECP_NISTZ256_INV_ORD, ERR_R_BN_LIB); + goto err; + } + ret = 1; +err: + return ret; +} +#endif + __owur static int ecp_nistz256_get_affine(const EC_GROUP *group, - const EC_POINT *point, - BIGNUM *x, BIGNUM *y, BN_CTX *ctx) + const EC_POINT *point, + BIGNUM *x, BIGNUM *y, BN_CTX *ctx) { BN_ULONG z_inv2[P256_LIMBS]; BN_ULONG z_inv3[P256_LIMBS]; @@ -1519,7 +1664,12 @@ const EC_METHOD *EC_GFp_nistz256_method(void) 0, /* field_div */ ec_GFp_mont_field_encode, ec_GFp_mont_field_decode, - ec_GFp_mont_field_set_to_one + ec_GFp_mont_field_set_to_one, +#if defined(__x86_64) || defined(__x86_64__) + ecp_nistz256_inv_mod_ord +#else + 0 +#endif }; return &ret; diff --git a/crypto/ecdsa/ecs_ossl.c b/crypto/ecdsa/ecs_ossl.c index 27266e9173316..4200017fa0e51 100644 --- a/crypto/ecdsa/ecs_ossl.c +++ b/crypto/ecdsa/ecs_ossl.c @@ -158,9 +158,10 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, * compute G*k using an equivalent scalar of fixed bit-length. */ - if (!BN_add(k, k, order)) - goto err; - if (BN_num_bits(k) <= BN_num_bits(order)) + if (BN_num_bits(k) < BN_num_bits(order)) + if (!BN_add(k, k, order)) + goto err; + if (BN_num_bits(k) < BN_num_bits(order)) if (!BN_add(k, k, order)) goto err; @@ -195,31 +196,8 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, } while (BN_is_zero(r)); - /* compute the inverse of k */ - if (EC_GROUP_get_mont_data(group) != NULL) { - /* - * We want inverse in constant time, therefore we utilize the fact - * order must be prime and use Fermats Little Theorem instead. 
-             */
-            if (!BN_set_word(X, 2)) {
-                ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
-                goto err;
-            }
-            if (!BN_mod_sub(X, order, X, order, ctx)) {
-                ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
-                goto err;
-            }
-            BN_set_flags(X, BN_FLG_CONSTTIME);
-            if (!BN_mod_exp_mont_consttime
-                (k, k, X, order, ctx, EC_GROUP_get_mont_data(group))) {
-                ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
-                goto err;
-            }
-        } else {
-            if (!BN_mod_inverse(k, k, order, ctx)) {
-                ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
-                goto err;
-            }
+        if (!EC_GROUP_do_inverse_ord(group, k, k, ctx, 1)) {
+            goto err;
         }

     /* clear old values if necessary */
@@ -399,10 +377,10 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len,
         goto err;
     }
     /* calculate tmp1 = inv(S) mod order */
-    if (!BN_mod_inverse(u2, sig->s, order, ctx)) {
-        ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB);
+    if (!EC_GROUP_do_inverse_ord(group, u2, sig->s, ctx, 0)) {
         goto err;
     }
+
     /* digest -> m */
     i = BN_num_bits(order);
     /*
diff --git a/include/openssl/ec.h b/include/openssl/ec.h
index 2d36dd5ea74e8..a786393671493 100644
--- a/include/openssl/ec.h
+++ b/include/openssl/ec.h
@@ -248,6 +248,17 @@ const EC_POINT *EC_GROUP_get0_generator(const EC_GROUP *group);
 */
 BN_MONT_CTX *EC_GROUP_get_mont_data(const EC_GROUP *group);
+/** Performs inversion modulo the order of an EC_GROUP
+ * \param group EC_GROUP object
+ * \param res BIGNUM to which the result is copied
+ * \param x BIGNUM whose inverse is computed
+ * \param ctx BN_CTX object
+ * \param constantTime int; if nonzero, the computation is performed in constant time
+ * \return 1 on success and 0 if an error occurred
+ */
+int EC_GROUP_do_inverse_ord(const EC_GROUP *group, BIGNUM *res,
+                            const BIGNUM *x, BN_CTX *ctx, int constantTime);
+
 /** Gets the order of a EC_GROUP
 * \param group EC_GROUP object
 * \param order BIGNUM to which the order is copied
@@ -1167,6 +1178,7 @@ void ERR_load_EC_strings(void);
 # define EC_F_EC_GROUP_SET_CURVE_GFP 109
 # define EC_F_EC_GROUP_SET_EXTRA_DATA 110
 # define EC_F_EC_GROUP_SET_GENERATOR 111
+# define EC_F_EC_GROUP_ORD_INVERSE 245
 # define EC_F_EC_KEY_CHECK_KEY 177
 # define EC_F_EC_KEY_COPY 178
 # define EC_F_EC_KEY_GENERATE_KEY 179
@@ -1212,6 +1224,7 @@ void ERR_load_EC_strings(void);
 # define EC_F_ECP_NISTZ256_WINDOWED_MUL 242
 # define EC_F_ECP_NISTZ256_MULT_PRECOMPUTE 243
 # define EC_F_ECP_NISTZ256_PRE_COMP_NEW 244
+# define EC_F_ECP_NISTZ256_INV_ORD 246
 # define EC_F_O2I_ECPUBLICKEY 152
 # define EC_F_OLD_EC_PRIV_DECODE 222
 # define EC_F_PKEY_EC_CTRL 197
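
The new ecp_nistz256_ord_mul_mont and ecp_nistz256_ord_sqr_mont routines perform word-by-word (CIOS-style) Montgomery multiplication modulo the P-256 group order, driven by the .Lord limbs and the constant .LordK = -ord^-1 mod 2^64. The C function below is a minimal portable sketch of that technique, assuming a compiler that provides unsigned __int128; the name ord_mul_mont_ref is illustrative only, it is not the shipped assembly, and its final ternary select is not constant time (the assembly uses cmov there). It computes res = a * b * 2^-256 mod ord, matching the comment on the assembly entry point.

#include <stdint.h>

/* P-256 group order, least significant 64-bit word first (same values as .Lord) */
static const uint64_t ord[4] = {
    0xf3b9cac2fc632551ULL, 0xbce6faada7179e84ULL,
    0xffffffffffffffffULL, 0xffffffff00000000ULL
};
/* -ord^-1 mod 2^64 (same value as .LordK) */
static const uint64_t ord_k0 = 0xccd1c8aaee00bc4fULL;

/* res = a * b * 2^-256 mod ord, interleaved (CIOS) Montgomery multiplication */
static void ord_mul_mont_ref(uint64_t res[4], const uint64_t a[4], const uint64_t b[4])
{
    uint64_t t[6] = { 0, 0, 0, 0, 0, 0 };
    int i, j;

    for (i = 0; i < 4; i++) {
        /* t += a * b[i] */
        unsigned __int128 acc = 0;
        for (j = 0; j < 4; j++) {
            acc += (unsigned __int128)a[j] * b[i] + t[j];
            t[j] = (uint64_t)acc;
            acc >>= 64;
        }
        acc += t[4];
        t[4] = (uint64_t)acc;
        t[5] = (uint64_t)(acc >> 64);

        /* one reduction step: add m*ord so t[0] becomes 0, then shift right one word */
        uint64_t m = t[0] * ord_k0;
        acc = (unsigned __int128)m * ord[0] + t[0];
        acc >>= 64;
        for (j = 1; j < 4; j++) {
            acc += (unsigned __int128)m * ord[j] + t[j];
            t[j - 1] = (uint64_t)acc;
            acc >>= 64;
        }
        acc += t[4];
        t[3] = (uint64_t)acc;
        t[4] = t[5] + (uint64_t)(acc >> 64);
    }

    /* final conditional subtraction of ord */
    uint64_t d[4], borrow = 0;
    for (j = 0; j < 4; j++) {
        unsigned __int128 diff = (unsigned __int128)t[j] - ord[j] - borrow;
        d[j] = (uint64_t)diff;
        borrow = (uint64_t)(diff >> 64) & 1;
    }
    for (j = 0; j < 4; j++)
        res[j] = (borrow > t[4]) ? t[j] : d[j];
}

ecp_nistz256_ord_sqr_mont is the same operation with a == b, repeated rep times, which is what lets ecp_nistz256_inv_mod_ord walk the fixed addition chain for x^(ord-2).

On the C side, the only public addition is EC_GROUP_do_inverse_ord(). The sketch below shows a hypothetical caller (the function name inverse_mod_order_example and the overall flow are illustrative, not part of the patch); it passes constantTime = 1 because the scalar is treated as secret, exactly as ecdsa_sign_setup() now does for the nonce k, whereas ecdsa_do_verify() passes 0 since the signature value is public.

#include <openssl/bn.h>
#include <openssl/ec.h>
#include <openssl/obj_mac.h>

int inverse_mod_order_example(void)
{
    int ok = 0;
    EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
    BN_CTX *ctx = BN_CTX_new();
    BIGNUM *x = BN_new(), *inv = BN_new(), *order = BN_new();

    if (group == NULL || ctx == NULL || x == NULL || inv == NULL || order == NULL)
        goto done;
    if (!EC_GROUP_get_order(group, order, ctx))
        goto done;
    do {                        /* pick a non-zero scalar x < order */
        if (!BN_rand_range(x, order))
            goto done;
    } while (BN_is_zero(x));

    /* constantTime = 1: treat x as secret (e.g. an ECDSA nonce) */
    if (!EC_GROUP_do_inverse_ord(group, inv, x, ctx, 1))
        goto done;
    ok = 1;
 done:
    BN_free(order);
    BN_free(inv);
    BN_free(x);
    BN_CTX_free(ctx);
    EC_GROUP_free(group);
    return ok;
}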