Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mem extension (part 1) #1166

Merged
merged 24 commits into from
Mar 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
108 changes: 108 additions & 0 deletions fiat-amd64/10_ratio11074_seed2040113663254706_square_poly1305.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
SECTION .text
GLOBAL square_poly1305
square_poly1305:
sub rsp, 0x30 ; last 0x30 (6) for Caller - save regs

;instead of
; mov [ rsp + 0x0 ], rbx; saving to stack

; we also want to use
mov [ rsp ], rbx; saving to stack

mov [ rsp + 0x8 ], rbp; saving to stack
mov [ rsp + 0x10 ], r12; saving to stack
mov [ rsp + 0x18 ], r13; saving to stack
mov [ rsp + 0x20 ], r14; saving to stack
mov [ rsp + 0x28 ], r15; saving to stack

; instead of
;imul rax, [ rsi + 0x10 ], 0x5; x1 <- arg1[2] * 0x5

; we want also to use
mov rax, [ rsi + 0x10 ]
lea rax, [ rax + 4 * rax ]

; instead of
;imul r10, rax, 0x2; x2 <- x1 * 0x2

; we also want to use
lea r10, [ 2 * rax]

mov rdx, [ rsi + 0x0 ]; arg1[0] to rdx
mulx r11, rbx, [ rsi + 0x0 ]; x16, x15<- arg1[0] * arg1[0]

; instead of
; imul r10, r10, 0x2; x10000 <- x2 * 0x2

;we want to use
lea r10, [r10 + r10]

; we also still need to support basic addressing with REG+off
mov rdx, [ rsi + 0x8 ]; arg1[1] to rdx

mulx r10, rbp, r10; x8, x7<- arg1[1] * x10000
xor r12, r12
adox rbp, rbx
adox r11, r10
mov rdx, [ rsi + 0x10 ]; arg1[2] to rdx
mulx rax, r13, rax; x6, x5<- arg1[2] * x1
mov r14, 0xfffffffffff ; moving imm to reg
mov r15, rbp; x22, copying x17 here, cause x17 is needed in a reg for other than x22, namely all: , x21, x22, size: 2
and r15, r14; x22 <- x17&0xfffffffffff
imul rdx, [ rsi + 0x8 ], 0x2; x4 <- arg1[1] * 0x2
mulx rdx, rcx, [ rsi + 0x0 ]; x14, x13<- arg1[0] * x4
shrd rbp, r11, 44; x21 <- x19||x17 >> 44
xor r8, r8
adox r13, rcx
adox rdx, rax
adcx r13, rbp
adc rdx, 0x0
imul r12, [ rsi + 0x8 ], 0x2; x10001 <- arg1[1] * 0x2
mov r9, r13; x34, copying x31 here, cause x31 is needed in a reg for other than x34, namely all: , x34, x35, size: 2
shrd r9, rdx, 43; x34 <- x33||x31 >> 43
imul rbx, [ rsi + 0x10 ], 0x2; x3 <- arg1[2] * 0x2
mov rdx, [ rsi + 0x8 ]; arg1[1] to rdx
mulx r12, r10, r12; x10, x9<- arg1[1] * x10001
mov rdx, [ rsi + 0x0 ]; arg1[0] to rdx
mulx rbx, r11, rbx; x12, x11<- arg1[0] * x3
xor rdx, rdx
adox r10, r11
adox rbx, r12
adcx r10, r9
adc rbx, 0x0
mov r8, r10; x39, copying x36 here, cause x36 is needed in a reg for other than x39, namely all: , x39, x40, size: 2
shrd r8, rbx, 43; x39 <- x38||x36 >> 43
imul r8, r8, 0x5; x41 <- x39 * 0x5
mov rax, 0x7ffffffffff ; moving imm to reg
and r13, rax; x35 <- x31&0x7ffffffffff
lea r15, [ r15 + r8 ]
mov rcx, r15; x44, copying x42 here, cause x42 is needed in a reg for other than x44, namely all: , x44, x43, size: 2
and rcx, r14; x44 <- x42&0xfffffffffff
shr r15, 44; x43 <- x42>> 44
lea r15, [ r15 + r13 ]
mov rbp, r15; x47, copying x45 here, cause x45 is needed in a reg for other than x47, namely all: , x47, x46, size: 2
and rbp, rax; x47 <- x45&0x7ffffffffff
shr r15, 43; x46 <- x45>> 43
and r10, rax; x40 <- x36&0x7ffffffffff
mov [ rdi + 0x8 ], rbp; out1[1] = x47
lea r15, [ r15 + r10 ]
mov [ rdi + 0x0 ], rcx; out1[0] = x44
mov [ rdi + 0x10 ], r15; out1[2] = x48
mov rbx, [ rsp + 0x0 ]; restoring from stack
mov rbp, [ rsp + 0x8 ]; restoring from stack
mov r12, [ rsp + 0x10 ]; restoring from stack
mov r13, [ rsp + 0x18 ]; restoring from stack
mov r14, [ rsp + 0x20 ]; restoring from stack
mov r15, [ rsp + 0x28 ]; restoring from stack
add rsp, 0x30
ret
; cpu Intel(R) Core(TM) i7-10710U CPU @ 1.10GHz
; clocked at 1600 MHz
; first cyclecount 15.38, best 10.657863145258103, lastGood 10.66546329723225
; seed 2040113663254706
; CC / CFLAGS clang / -march=native -mtune=native -O3
; time needed: 15590 ms / 500 runs=> 31.18ms/run
; Time spent for assembling and measureing (initial batch_size=833, initial num_batches=101): 13489 ms
; Ratio (time for assembling + measure)/(total runtime for 500runs): 0.8652341244387428
; number reverted permutation/ tried permutation: 221 / 270 =81.852%
; number reverted decision/ tried decision: 169 / 231 =73.160%
5 changes: 3 additions & 2 deletions src/Assembly/Equality.v
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Require Import Crypto.Util.Bool.Reflect.
Require Import Crypto.Util.ListUtil.
Require Import Crypto.Util.Tactics.DestructHead.
Require Import Crypto.Util.Notations.
Require Import Crypto.Util.Prod.
Require Import Crypto.Assembly.Syntax.

Delimit Scope REG_scope with REG.
Expand Down Expand Up @@ -48,8 +49,8 @@ Bind Scope MEM_scope with MEM.

Definition MEM_beq (x y : MEM) : bool
:= ((Bool.eqb x.(mem_is_byte) y.(mem_is_byte))
&& (x.(mem_reg) =? y.(mem_reg))%REG
&& (option_beq REG_beq x.(mem_extra_reg) y.(mem_extra_reg))
&& (option_beq REG_beq x.(mem_base_reg) y.(mem_base_reg))
&& (option_beq (prod_beq _ _ Z.eqb REG_beq) x.(mem_scale_reg) y.(mem_scale_reg))
&& (option_beq Z.eqb x.(mem_offset) y.(mem_offset)))%bool.
Global Arguments MEM_beq !_ !_ / .

Expand Down
110 changes: 0 additions & 110 deletions src/Assembly/EquivalenceProofs.v
Original file line number Diff line number Diff line change
Expand Up @@ -2037,116 +2037,6 @@ Proof.
eauto 20. }
Qed.

(* TODO: move or remove *)
Lemma Address_ok G s s' (sa:AddressSize:=64%N) a rn lo sz idx base_val
(Hreg : index_and_shift_and_bitcount_of_reg (mem_reg a) = (rn, lo, sz))
(H64 : sz = 64%N)
(H : @Address sa a s = Success (idx, s'))
(d := s.(dag_state))
(d' := s'.(dag_state))
(r := s.(symbolic_reg_state))
(r' := s'.(symbolic_reg_state))
(f := s.(symbolic_flag_state))
(f' := s'.(symbolic_flag_state))
(m := s.(symbolic_mem_state))
(m' := s'.(symbolic_mem_state))
(Hd : gensym_dag_ok G d)
(Hr : forall idx', get_reg r rn = Some idx' -> eval_idx_Z G d idx' base_val)
(Hindex : mem_extra_reg a = None)
(offset := match mem_offset a with
| Some s => s
| None => 0%Z
end)
: (eval_idx_Z G d' idx (Z.land (base_val + offset) (Z.ones 64))
/\ r = r'
/\ f = f'
/\ m = m')
/\ gensym_dag_ok G d'
/\ (forall e n, eval G d e n -> eval G d' e n).
Proof.
cbv [Address Symbolic.App Merge Symbolic.bind] in H.
subst sa offset d d'.
repeat first [ progress inversion_ErrorT
| progress destruct_head'_prod
| progress inversion_option
| progress cbn [fst snd] in *
| progress destruct_head'_and
| progress cbv [update_dag_with] in *
| match goal with
| [ H : GetReg _ _ = _ |- _ ]
=> eapply GetReg_ok in H; [ | eassumption .. ]
| [ H : ErrorT.bind ?x _ = _ |- _ ]
=> destruct x eqn:?; unfold ErrorT.bind at 1 in H
| [ H : ?v = Success _ |- _ ]
=> match v with
| context[match ?x with Some _ => _ | _ => _ end]
=> destruct x eqn:?
end
end ].
(* need to do this early to deal with conversion slowness *)
repeat first [ match goal with
| [ H : context[simplify ?s ?n] |- _ ]
=> unshelve epose proof (@eval_simplify _ s n _ _); shelve_unifiable;
[ solve [ repeat first [ eassumption | solve [ eauto ] | exactly_once econstructor ] ] | ];
generalize dependent (simplify s n); intros
| [ H : context[merge ?e ?d] |- _ ]
=> pose proof (@eval_merge _ e _ d ltac:(eassumption) ltac:(eassumption));
generalize dependent (merge e d); intros
| [ H : (?x, ?y) = (?z, ?w) |- _ ] => is_var x; is_var z; is_var w; inversion H; clear H
end
| progress destruct_head'_prod
| progress destruct_head'_and
| progress subst
| progress cbn [fst snd symbolic_reg_state symbolic_flag_state symbolic_mem_state dag_state fold_right] in *
| break_innermost_match_hyps_step ].
all: repeat first [ solve [ eauto 10 ]
| apply conj
| progress cbv [eval_idx_Z]
| match goal with
| [ H : eval ?G ?d ?e ?v |- eval ?G ?d' ?e ?v' ]
=> cut (v' = v);
[ solve [ intros ->; eauto 10 ]
| ]
end
| progress change (Z.of_N 64) with 64%Z
| rewrite !Z.land_ones by lia
| progress autorewrite with zsimplify_const
| progress (push_Zmod; pull_Zmod) ].
Qed.

(* TODO: move or remove *)
Lemma Address_ok_bounded G s s' (sa:AddressSize:=64%N) a rn lo sz idx base_val
(Hreg : index_and_shift_and_bitcount_of_reg (mem_reg a) = (rn, lo, sz))
(H64 : sz = 64%N)
(H : @Address sa a s = Success (idx, s'))
(d := s.(dag_state))
(d' := s'.(dag_state))
(r := s.(symbolic_reg_state))
(r' := s'.(symbolic_reg_state))
(f := s.(symbolic_flag_state))
(f' := s'.(symbolic_flag_state))
(m := s.(symbolic_mem_state))
(m' := s'.(symbolic_mem_state))
(Hd : gensym_dag_ok G d)
(Hr : forall idx', get_reg r rn = Some idx' -> eval_idx_Z G d idx' base_val)
(Hindex : mem_extra_reg a = None)
(offset := match mem_offset a with
| Some s => s
| None => 0%Z
end)
(Hv : (0 <= base_val + offset < 2^64)%Z)
: (eval_idx_Z G d' idx (base_val + offset)
/\ r = r'
/\ f = f'
/\ m = m')
/\ gensym_dag_ok G d'
/\ (forall e n, eval G d e n -> eval G d' e n).
Proof.
set (v := (base_val + offset)%Z) in *.
replace v with (Z.land v (Z.ones (Z.of_N 64))); [ eapply Address_ok; eassumption | rewrite land_ones_eq_of_bounded by assumption ].
reflexivity.
Qed.

Lemma compute_stack_base_G_ok G G' s s' rv
(rsp_val : Z) (stack_size : nat)
(H : compute_stack_base_G rsp_val stack_size (G, s) = Some (rv, (G', s')))
Expand Down
60 changes: 40 additions & 20 deletions src/Assembly/Parse.v
Original file line number Diff line number Diff line change
Expand Up @@ -174,21 +174,40 @@ Definition parse_FLAG : ParserAction FLAG

Definition parse_MEM : ParserAction MEM
:= parse_map
(fun '(has_byte, (r, (r', maybe_pm_z)))
=> {| mem_is_byte := if has_byte:option _ then true else false
; mem_reg := r
; mem_extra_reg := r'
; mem_offset := match maybe_pm_z with
| inl (inl _ (* plus *), z) => Some z
| inl (inr _ (* minus *), z) => Some (-z)
| inr _ (* only whitespace *) => None
end%Z |})
(((strip_whitespace_after "byte ")?) ;;
(strip_whitespace_after "[" ;;R
parse_REG ;;
((strip_whitespace_around "+" ;;R parse_REG)?) ;;
((strip_whitespace_before ("+" ||->{id} "-") ;; parse_Z_arith_strict) ||->{id} parse_any_whitespace) ;;L
"]")).
(fun '(has_byte, (br (*base reg*), sr (*scale reg, including z *), offset))
=> {| mem_is_byte := if has_byte:option _ then true else false
; mem_base_reg := br:option REG
; mem_scale_reg := sr:option (Z * REG)
; mem_offset := offset:option Z |})
(((strip_whitespace_after "byte ")?) ;;
("["
;;R
(
(*[rax] *)
(parse_map
(fun r => (Some r, None, None))
(strip_whitespace_around parse_REG))
|| (*[rax + 0x10]*)
(parse_map
(fun '(br, (pm, z)) => (Some br, None, Some match pm with
| inl _ (* plus *) => z
| inr _ (* minus *) => -z
end%Z))
(strip_whitespace_around parse_REG ;; ("+" ||->{id} "-") ;; parse_Z_arith_strict))
|| (*[rax + rbx] *)
(parse_map
(fun '(br, sr) => (Some br, Some (1%Z, sr), None))
(strip_whitespace_around parse_REG ;; "+" ;;R strip_whitespace_around parse_REG))
|| (*[rax + 2 * rbx] *)
(parse_map
(fun '(br, (z, sr)) => (Some br, Some (z, sr), None))
(strip_whitespace_around parse_REG ;; "+" ;;R parse_Z_arith_strict ;; "*" ;;R strip_whitespace_around parse_REG))
|| (*[ 2 * rax] *)
(parse_map
(fun '(z, r) => (None, Some (z, r), None))
(parse_Z_arith_strict ;; "*" ;;R strip_whitespace_around parse_REG))
)
;;L "]")).

Definition parse_CONST (const_keyword : bool) : ParserAction CONST
:= if const_keyword
Expand Down Expand Up @@ -271,11 +290,12 @@ Global Instance show_lvl_MEM : ShowLevel MEM
:= fun m
=> (if m.(mem_is_byte) then show_lvl_app (fun 'tt => "byte") else show_lvl)
(fun 'tt
=> "[" ++ (show m.(mem_reg))
++ (match m.(mem_extra_reg) with
| None => ""
| Some r => " + " ++ show r
end)
=> "[" ++ (match m.(mem_base_reg), m.(mem_scale_reg) with
| (*"[Reg]" *) Some br, None => show_REG br
| (*"[Reg + Z * Reg]"*) Some br, Some (z, sr) => show_REG br ++ " + " ++ Decimal.show_Z z ++ " * " ++ show_REG sr (*only matching '+' here, because there cannot be a negative scale. *)
| (*"[ Z * Reg]"*) None, Some (z, sr) => Decimal.show_Z z ++ " * " ++ show_REG sr
| (*"[ ]"*) None, None => "" (* impossible, because only offset is invalid, but we seem to need it for coq because both are option's*)
end%Z)
++ (match m.(mem_offset) with
| None => ""
| Some offset
Expand Down