- sections -
.data
,.text
- can be mixed in source code, but will be contiguous in resulting binary
- a program can be composed of multiple files, but there must only be one
_start
text
format: [name]: [type] [values]- an “immediate value” is equivalent to a literal in higher level languages
- data “types”:
- db - bytes
- dw - words; 2 bytes
- dd - double words; 4 bytes
- dq - quad words; 8 bytes
- label starting with
.
is a local label - global labels act as pseudo-namespaces, allowing local labels with the same name to be distinguished
- rax - accumulator
- rbx - base register; used for addressing in early cpus
- rcx - cycles (loops)
- rdx - store data during i/o ops
- rsp - addr of topmost el in hw stack
- rbp - stack frame’s base
- rsi - source index in string manip cmds
- rdi - destination index in str manip cmds
Bit(s) | Label | Description |
---|---|---|
0 | CF | Carry Flag |
1 | 1 | Reserved |
2 | PF | Parity Flag |
3 | 0 | Reserved |
4 | AF | Auxiliary Carry Flag |
5 | 0 | Reserved |
6 | ZF | Zero Flag |
7 | SF | Sign Flag |
8 | TF | Trap Flag |
9 | IF | Interrupt Enable Flag |
10 | DF | Direction Flag |
11 | OF | Overflow Flag |
12-13 | IOPL | I/O Privilege Level |
14 | NT | Nested Task |
15 | 0 | Reserved |
16 | RF | Resume Flag |
17 | VM | Virtual-8086 Mode |
18 | AC | Alignment Check / Access Control |
19 | VIF | Virtual Interrupt Flag |
20 | VIP | Virtual Interrupt Pending |
21 | ID | ID Flag |
22-63 | 0 | Reserved |
- these are the only ones displayed by
edb
(Evan’s debugger), abbreviated with one letter
Bit(s) | Label | Abbr | Description |
---|---|---|---|
0 | CF | C | Carry Flag |
2 | PF | P | Parity Flag |
4 | AF | A | Auxiliary Carry Flag |
6 | ZF | Z | Zero Flag |
7 | SF | S | Sign Flag |
8 | TF | T | Trap Flag |
10 | DF | D | Direction Flag |
11 | OF | O | Overflow Flag |
; Minimal "hello, world": a single write syscall to stdout.
; NOTE: there is no exit syscall after the write, so execution continues past
; the end of the program once the message has been printed.
global _start
section .data
message: db 'hello, world!', 10 ; 13 characters followed by a newline (10) - 14 bytes in total
section .text
_start:
mov rax, 1 ; store syscall number/id (1 = write)
mov rdi, 1 ; syscall arg #1 - file descriptor
mov rsi, message ; syscall arg #2 - address of the first byte to write
mov rdx, 14 ; syscall arg #3 - number of bytes to write (length of message)
syscall
- program has no exit instruction, and thus continues executing after syscall
- next address likely will contain garbage resulting in such errors
- assembler used: NASM version 2.14.02 on Ubuntu
- nasm is also available for Windows; syntax is the same but syscall and register usage is different
- Linux is used/assumed unless mentioned otherwise
- to compile and execute a program:
$ nasm -felf64 hello.asm -o hello.o $ ld -o hello hello.o $ chmod u+x hello $ ./hello hello, world! Segmentation fault (core dumped)
- often, mistakes in assembly programs result in “core dumped” messages
- if
ulimit -c
outputs 0
, no cores were actually dumped; this can be for a number of reasons - for reference info and reasons why a core wasn’t dumped, see https://man7.org/linux/man-pages/man5/core.5.html
ulimit -c unlimited
should enable core dumps, in the form of a core
file
- core files can be opened with
gdb
or other tools, like radare2
r2 core
to open a core file in radare2
dr
to print registers at the time of the dump
pxr @ rsp
to print the stack
- https://reverseengineering.stackexchange.com/q/16844
- https://reverseengineering.stackexchange.com/q/20434
; Prints "goodbye, world!" followed by a newline on stdout, then exits with
; status 0.
global _start

section .data
message:     db 'goodbye, world!', 10
; Length computed by the assembler ($ = current address), so the byte count
; handed to write can never get out of sync with the text. The message is
; 16 bytes; a hard-coded 14 would cut off the "!" and the newline.
MESSAGE_LEN: equ $ - message

section .text
_start:
    mov rax, 1              ; syscall number/id (1 = write)
    mov rdi, 1              ; syscall arg #1 - fd - stdout
    mov rsi, message        ; syscall arg #2 - buffer address
    mov rdx, MESSAGE_LEN    ; syscall arg #3 - number of bytes to write
    syscall

    mov rax, 60             ; exit syscall
    xor rdi, rdi            ; exit code 0
    syscall
- “hardcoded” into the binary when assembly is compiled
- will be loaded in memory at runtime, and can be changed - but won’t affect the binary on the disk
- data will be padded - if declaring a word (2 bytes) but only providing one byte, the remaining bytes will be filled with 00
- variable “names” are just labels, not different from the labels used to reference function starting points
- every label corresponds to the address of the first byte in memory (for a multi-byte value this is its least significant byte, because x86 is little endian)
- if data size exceeds the declared size, a warning will be printed but as much data as fits in the variable will be taken
- if data size is less than the declared size, the remaining bytes will be padded with 00
; Demonstrates how db/dw/dd values end up laid out in the .data section.
global _start
section .data
a: db 0xAA ; 1 byte: AA
b: db 0xBB00 ; 1 byte: 00 - only the low byte is stored; warning: byte data exceeds bounds [-w+number-overflow]
c: dw 0xCC00 ; 2 bytes: 00, CC (least significant byte first)
d: dw 0x00DD ; 2 bytes: DD, 00
e: dd 0x00EE ; 4 bytes: EE, 00, 00, 00 (the value only needs one byte, so the remaining ones will be 00)
f: dd 0x00FFFF ; 4 bytes: FF, FF, 00, 00 (dd is a double word - 4 bytes; the value fills two, the remaining ones will be 00)
w: dw 'abcd' ; NOTE(review): presumably stored as the 4 bytes 61, 62, 63, 64 (a string is padded to a multiple of the word size) - confirm
; resulting .data layout for a-f (w follows directly after f):
; a  b  c     d     e           f
; AA 00 00 CC DD 00 EE 00 00 00 FF FF 00 00
section .text
_start:
mov rax, 60 ; exit syscall
mov rdi, 0 ; exit code
syscall
- https://en.wikipedia.org/wiki/.bss
- can be used as a “scratch” to hold uninitialized data
- compiler can optimize the resulting object file by storing only the size (as opposed to the
.data
section) - declaring uninitialized data: https://www.nasm.us/doc/nasmdoc3.html#section-3.2.2
; Reserves uninitialized storage in .bss, then exits immediately with status 0.
global _start
section .bss
buffer: resb 64 ; reserve 64 bytes
words1: resw 10 ; reserve 10 words (20 bytes)
words2 resw 20 ; reserve 20 words (40 bytes); the ":" after the label name is optional
section .text
_start:
mov rax, 60 ; exit syscall
mov rdi, 0 ; exit code
syscall
- assembly doesn’t have
if
statements - but it has instructions which, after executing, will flip bits in a special register,rflags
- the following instruction can then check the
rflags
bits it is interested in
- cmp
and sub
are very similar; in both cases, a subtraction is performed and flags are set in the same manner; sub
will also update the first operand with the result
- if the first operand of
cmp
is greater than the second one (compared as unsigned values), it will set the first bit in rflags
(known as CF
) to 0 - if the second operand is greater than the first,
CF
will be 1 - other flags: (source: https://stackoverflow.com/a/43844182)
CF - 1 if unsigned op2 > unsigned op1 OF - 1 if sign bit of OP1 != sign bit of result SF - 1 if MSB (aka sign bit) of result = 1 ZF - 1 if Result = 0 (i.e. op1=op2) AF - 1 if Carry in the low nibble of result PF - 1 if Parity of Least significant byte is even
- for example, if we use
cmp
to compare 42 and 43:
; Shows the flags produced by cmp for op1 < op2, op1 > op2 and op1 = op2.
; Meant to be single-stepped in a debugger; the tables list the flag values
; right after each cmp.
global _start
section .text
_start:
    ; cmp cannot take an immediate as its first operand ("cmp 42, 43" is
    ; rejected by the assembler), so op1 lives in rax. mov does not modify
    ; the flags.
    mov rax, 42

    cmp rax, 43 ; scenario A - op1 < op2 (42 vs 43)
    ; | Bit(s) | Label| Value  | Description                      |
    ; |--------+------+--------+----------------------------------|
    ; | 0      | CF   | 1      | Carry Flag                       |
    ; | 2      | PF   | 1      | Parity Flag                      |
    ; | 4      | AF   | 1      | Auxiliary Carry Flag             |
    ; | 6      | ZF   | 0      | Zero Flag                        |
    ; | 7      | SF   | 1      | Sign Flag                        |
    ; | 8      | TF   | 0      | Trap Flag                        |
    ; | 10     | DF   | 0      | Direction Flag                   |
    ; | 11     | OF   | 0      | Overflow Flag                    |

    cmp rax, 41 ; scenario B - op1 > op2 (42 vs 41)
    ; | Bit(s) | Label| Value  | Description                      |
    ; |--------+------+--------+----------------------------------|
    ; | 0      | CF   | 0      | Carry Flag                       |
    ; | 2      | PF   | 0      | Parity Flag                      |
    ; | 4      | AF   | 0      | Auxiliary Carry Flag             |
    ; | 6      | ZF   | 0      | Zero Flag                        |
    ; | 7      | SF   | 0      | Sign Flag                        |
    ; | 8      | TF   | 0      | Trap Flag                        |
    ; | 10     | DF   | 0      | Direction Flag                   |
    ; | 11     | OF   | 0      | Overflow Flag                    |

    cmp rax, 42 ; scenario C - op1 = op2 (42 vs 42)
    ; | Bit(s) | Label| Value  | Description                      |
    ; |--------+------+--------+----------------------------------|
    ; | 0      | CF   | 0      | Carry Flag                       |
    ; | 2      | PF   | 1      | Parity Flag                      |
    ; | 4      | AF   | 0      | Auxiliary Carry Flag             |
    ; | 6      | ZF   | 1      | Zero Flag                        |
    ; | 7      | SF   | 0      | Sign Flag                        |
    ; | 8      | TF   | 0      | Trap Flag                        |
    ; | 10     | DF   | 0      | Direction Flag                   |
    ; | 11     | OF   | 0      | Overflow Flag                    |
radare2
has thedrc
command, which gives the following outputs for gt, lt and eq:
0 0 1 EQ 1 1 0 NE 1 0 0 CF 1 0 0 NEG 0 0 0 OF 1 0 0 HI 1 0 1 HE 0 1 1 LO 0 1 1 LOE 0 1 1 GE 0 1 0 GT 1 0 0 LT 1 0 1 LE
- when doing comparisons, if the size of the first operand is known, it determines how many bytes will be taken from the second operand
- when an operand is read from a memory address, bytes will be reversed
- when size is ambiguous, it can be specified on either operands (for
cmp
, at least)
; Shows how the operand size decides how many bytes cmp reads from memory.
; Meant to be single-stepped in a debugger; EQ/NE is the result of each cmp.
global _start
section .data
a: db 0xAA ; = 170
b: db 0xBB00 ; only 00 will be stored; a warning will be generated at compile time
c: dw 0xCC00
d: dw 0x00DD
; a b c d
; .data: AA0000CCDD00
section .text
_start:
mov rax, 0xAA
; rax: 00000000000000AA
; eax: 000000AA
; al: AA
; ah: 00
cmp rax, 0xAA ; EQ
cmp rax, [a] ; NE - rax compared with 000000DDCC0000AA - 8 bytes (because rax is 8 bytes) starting at a, but reversed because little-endian
; cmp 0xDDCC0000AA, [a] ; error: invalid combination of opcode and operands
; cmp [a], 0xDDCC0000AA ; error: operation size not specified
cmp [a], byte 0xDDCC0000AA ; EQ (only the low byte, AA, is compared); warning: byte data exceeds bounds [-w+number-overflow]
cmp byte [a], 0xDDCC0000AA ; EQ (same as above, size given on the memory operand); warning: byte data exceeds bounds [-w+number-overflow]
mov rcx, 0xDDCC0000AA ; load the full value into a register so that 8 bytes can be compared
; NOTE: mov is actually "movabs" in compiled code - https://reverseengineering.stackexchange.com/a/2628
; quote from http://www.ucw.cz/~hubicka/papers/amd64/node1.html:
; --------------------------------------------------------------
; The immediate operands of instructions has not been extended to 64 bits to
; keep instruction size smaller, instead they remain 32-bit sign extended.
; Additionally the movabs instruction to load arbitrary 64-bit constant into
; register and to load/store integer register from/to arbitrary constant
; 64-bit address is available.
cmp rcx, [a] ; EQ - NOTE(review): assumes the 2 bytes that follow the 6 bytes of .data are 00 - confirm
; because 'a' is only one byte, it should be compared with other 1 byte values
cmp al, [a] ; EQ; the size is taken from al, so ~byte~ (as below) is implied
cmp al, byte [a] ; EQ
cmp ah, [a] ; NE - ah is 00, a is AA
- let’s say we want to set
rbx
to 1 ifrax
is > 50, and to0
otherwise:
; Sets rbx to 1 if rax is greater than 50 (signed comparison), and to 0
; otherwise, then exits with status 0.
global _start
section .text
_start:
    mov rax, 51     ; it's > 50, so we want to set rbx to 1
    cmp rax, 50     ; compare the two operands - sets the flags in rflags
    jg  .large      ; jump if rax > 50 (jl would jump when rax < 50 instead)
    mov rbx, 0      ; rax <= 50
    jmp .done       ; skip the "large" branch
.large:
    mov rbx, 1      ; rax > 50
.done:
    mov rax, 60     ; exit syscall
    mov rdi, 0      ; exit code
    syscall
- option 1 - use an unconditional jump to end the loop
global _start
section .text
; Counting loop, "test first" shape: the condition is checked at the top of
; every iteration and an unconditional jump closes the loop, so the body is
; skipped entirely once the counter has reached the limit.
_start:
    mov rax, 0          ; rax = number of completed iterations
.check:
    cmp rax, 10         ; all 10 iterations done?
    je  .finished       ; yes - leave the loop
    ; ... do some work ...
    inc rax             ; one more iteration completed
    jmp .check          ; back to the condition
.finished:
    ; loop completed
- option 2 - allow execution to continue when looping not neccessary
global _start
section .text
; Counting loop, "test last" shape: the condition is checked at the bottom, so
; the body always runs at least once and execution simply falls through when
; no further iteration is needed.
_start:
    mov rax, 0          ; rax = number of completed iterations
.again:
    ; ... do some work ...
    inc rax             ; one more iteration completed
    cmp rax, 10         ; all 10 iterations done?
    jne .again          ; no - run the body again
.finished:
    ; loop completed
- on x86, signed integers are represented using two’s complement
- in two’s complement, the high (leftmost) bit is the sign bit
- if the high bit is 1, the number is negative; otherwise it’s positive
- to get the negative version of a number:
- subtract 1
- invert all bits
- ex:
0101
is 5; to get -5:0101 - 1 = 0100
0100 inverted = 1011
(-5 in two’s complement)
- to convert from two’s complement to (regular?) representation, the inverse process can be used:
- invert all bits
- add 1
- ex 1:
1011
(b
in hex) is -5 in two’s complement (could also be interpreted as 11 in decimal)1011 inverted = 0100
0100 + 1 = 0101
(5 in decimal)
- ex 2:
1111
is -1 in two’s complement (could also be interpreted as 15 in decimal)1111 inverted = 0000
0000 + 1 = 0001
(1 in decimal)
- for 8 bits (1 byte):
- max unsigned value is 255
- max signed is 127 (0111_1111), min is -128 (1000_0000, 0x80; inv = 0111_1111; +1 = 1000_0000)
; Shows which bit patterns end up in al for negative and out-of-range
; immediates, then exits with status 0.
global _start
section .data
section .text
_start:
mov al, -1 ; al = 0xFF, 1111_1111; unsigned = 255
mov al, -2 ; al = 0xFE, 1111_1110; unsigned = 254
mov al, 255 ; al = 0xFF - the same bit pattern as -1
mov al, 256 ; assembled as "mov al, 0" - 256 does not fit in one byte; warning
mov al, 257 ; assembled as "mov al, 1" - only the low byte is kept; warning
mov rax, 60 ; exit syscall
mov rdi, 0 ; exit code
syscall
- in assembly, there is no metadata associated with values stored in memory, such as their type
- when encountering an arithmetic instruction, the CPU will “just do it” without trying to understand whether it’s a signed or unsigned number
- it will, however, set the CF or OF flag
- carry flag (CF) - set when the result cannot be represented as an unsigned value, in the available space
- no sign bit required
- has no meaning for signed numbers - signed numbers can cause overflow, but not carry
- ex: adding 1 to 0xFF will cause CF to be set
- overflow flag (OF) - set when the result cannot be represented as a signed value, in the available space
- you overflow into the sign bit
- has no meaning for unsigned numbers - unsigned numbers can carry, but not overflow
- ex: subtracting 1 from 0x80 will cause OF to be set
inc
, dec
don’t affect the carry flag (CF), but they do update the overflow flag (OF)
- to clear the OF and CF flags:
test al, al
- the
set*
family of instructions can be used to set a byte based on flags
- ex:
setc al
⇒ sets al to the same value as the carry flag (1 or 0); set* only accepts an 8-bit register or memory operand
- ex:
; Shows when add/sub set the carry flag (CF) and the overflow flag (OF).
; Meant to be single-stepped in a debugger.
global _start
section .data
a: db 0x35 ; not referenced by the code below
section .text
_start:
mov al, 255
add al, 1 ; al = 0x00; CF = 1 (unsigned 255 + 1 does not fit), OF = 0 (signed -1 + 1 = 0 fits)
mov al, 0
sub al, 1 ; al = 0xFF; CF = 1 (unsigned 0 - 1 borrows), OF = 0 (signed 0 - 1 = -1 fits)
mov al, 0x80 ; al = 1000_0000, 0x80, -128 in two's complement
sub al, 1 ; al = 0111_1111, 0x7f, +127 in two's complement; OF=1 (-128 - 1 does not fit in a signed byte)
test al, al ; clears CF and OF before the next example
sub al, 0xFF ; al = 0x80; OF=1 (signed 127 - (-1) does not fit), CF=1 (unsigned 127 - 255 borrows)
mov rax, 60 ; exit syscall
mov rdi, 0 ; exit code
syscall
- use the smallest registers that will do the job, as it would be more efficient
- ex: don’t use
eax
when all you need isal
- ex: don’t use
- significantly different from
add
andsub
: - 1) different instructions for signed/unsigned ops
- 2) results of a multiplication/divisoin may require much more space
- if each operand is 2 digits, the result might require up to 4 digits - regardless of base
- ex: 99 x 99 = 9801
- unsigned multiplications done with
mul
, divisions withdiv
; both instructions have signed counterparts (imul
,idiv
) - the size of the operand will be used to determine how many bytes of
rax
to take as the first operand
- mul
result can span two registers; below, “operand” refers to the second operand - the parameter given to mul
:
- if the second register is required, CF and OF will be set to 1; otherwise both will be unset (zero)
- if operand is 8 bits, the result can be stored in
ax
- high byte toah
, low byte toal
- CF/OF being set doesn’t mean there’s something in
dx
, because high bits go into ah
- in all other cases, CF/OF means high bits were set in
dx/edx/rdx
- CF/OF being set doesn’t mean there’s something
- if operand is 16 bits: high bits go to
dx
, low bits toax
- if operand is 32 bits: high bits go to
edx
, low bits toeax
- if operand is 64 bits: high bits go to
rdx
, low bits torax
; Shows how the size of mul's operand selects the implicit first operand
; (al / ax / eax / rax) and where the result is stored.
; Meant to be single-stepped in a debugger.
global _start
section .data
section .text
_start:
mov al, 0xFF ; 255
mov r8b, 0xFF ; 255
mul r8 ; 64-bit multiply: rdx:rax = rax * r8 -> 65025, 0xFE01
; NOTE(review): only al and r8b were set above; the result assumes the upper bits of rax and r8 were already zero - confirm
; rax: 0xfe01, CF=0, OF=0 (the result fits in rax, nothing goes to rdx)
mov ax, 0xFFFF ; 65535
mov r8, 0xFFFFF ; 1048575 (will be assembled to r8d)
mul r8b ; 8-bit multiply: ax = al * r8b - only one byte of each is used, so 255 * 255 = 0xfe01
; eax: 0xfe01, CF=1, OF=1 (the high byte of the result, in ah, is non-zero)
; same as above - but multiplying by r8w instead of r8b
mov ax, 0xFFFF ; 65535
mov r8, 0xFFFFF ; 1048575
mul r8w ; 16-bit multiply: dx:ax = ax * r8w = 65535 * 65535 = 4294836225 (0xfffe0001) - using two registers
; dx: 0xfffe, ax: 0x0001, CF=1, OF=1
mov rax, 60 ; exit syscall
mov rdi, 0 ; exit code
syscall
- unlike
mul
,div
won’t touch the flags - but can generate exceptions - it can use two registers for both input and output
- the number to be divided will be taken from
rdx:rax
- important: given the above, clear
rdx
before dividing ! - result has two components: quotient and remainder
- if a word (2 bytes) is divided,
al
will hold the quotient,ah
the remainder) - if a dword (4 bytes) is divided, two registers will be used:
ax
- quotient,dx
remainder - similarly for 8 and 16 bytes; the quotient and remainder will span increasing parts of
rax
andrdx
, respectivelly
- if a word (2 bytes) is divided,
; Shows how the size of div's operand selects the dividend (ax, dx:ax, ...)
; and where the quotient and remainder are stored.
; Meant to be single-stepped in a debugger.
global _start
section .data
section .text
_start:
    ; the size of the instruction's operand determines how many bytes will be
    ; taken as dividend
    mov ax, 0x4e9b      ; 20123 (0x9b = 155)
    mov r8w, 0x03e8     ; 1000 (0xe8 = 232) - only one byte (r8b) will be taken into account
    div r8b             ; 8-bit divisor: ax / r8b
    ; al: 0x56, 86 (quotient)
    ; ah: 0xab, 171 (remainder)
    ; 86 * 232 = 19952 + 171 = 20123

    ; same as above, but dividing by r8w
    mov rdx, 0          ; 16-bit divisor: the dividend is dx:ax, so dx must be
                        ; cleared explicitly instead of relying on its initial value
    mov ax, 0x4e9b      ; 20_123
    mov r8w, 0x03e8     ; 1_000
    div r8w
    ; ax: 20 (0x14), dx: 123 (0x7b)

    ; same as above, but eax is much larger
    mov rdx, 0          ; if this is not cleared, result would be wrong !
    mov eax, 0xFFFF4e9b ; 4_294_921_883
    mov r8w, 0x03e8     ; 1_000
    div r8w             ; still dx:ax / r8w - the upper half of eax is ignored
    ; ax: 0x0014 (20)
    ; rax: 0xffff0014 ; 4_294_901_780 (only ax was set, ffff is leftover from the mov above !)
    ; rdx: 0x7b (123)
    mov rdx, 0          ; cleanup

    ; to divide 4626 (0x1212) by 1000:
    mov rax, 0x1212     ; 4626
    mov r8w, 0x03e8     ; 1000
    div r8w
    ; rax: 4, rdx: 626 (0x272)
    mov rdx, 0          ; cleanup

    ; if we leave ax the same but set dx, the bits in dx will be the high bits
    ; of the dividend, so 201_234 (0x31212) will be divided by 1000:
    mov rdx, 0x0003
    mov rax, 0x1212
    mov r8w, 0x03e8     ; 1000
    div r8w
    ; rax: 201 (0xc9), rdx: 234 (0xea)

    mov rax, 60         ; exit syscall
    mov rdi, 0          ; exit code
    syscall
xor
or
and
shl
sar rax, cl ; if trying to use, for ex, rcx: error: invalid combination of opcode and operands
; Shows which parts of rax are overwritten by mov, depending on the size of
; the destination register. Meant to be single-stepped in a debugger.
global _start
section .text
_start:
mov rax, 0xFFFFFFFFFFFFFFFF
; the following instructions will zero out only the low 2 bytes (size of ax);
; the upper 6 bytes of rax are left untouched
mov ax, 0
mov ax, 0x0
mov ax, 0x000000
; all the following instructions are written as 'mov 0' and will zero out all of rax
mov eax, 0 ; writing a 32-bit register also zeroes the upper 32 bits of the 64-bit register
mov rax, 0
mov word rax, 0x00 ; warning: register size specification ignored [-w+other]
; mov rax, word 0x00 ; error: mismatch in operand sizes
; mov rax, word 0x0000 ; error: mismatch in operand sizes
mov rax, 0x0000
- on Linux, each syscall has a unique, constant integer as ID; for example, for
write
it’s 1 - syscalls are invoked with the
syscall
instruction on x86-64; on 32, interrupt 0x80 is used instead - x86-64 Linux syscalls: http://blog.rchapman.org/posts/Linux_System_Call_Table_for_x86_64/
- syscall ids are different on 32 bits compared to 64
- to invoke a syscall:
- put its ID in
rax
- put arguments in the corresponding registers (see below)
- invoke
syscall
- put its ID in
- the kernel preserves all registers across a syscall, except rax (which receives the return value), and rcx and r11 (which are overwritten by the syscall instruction itself)
Argument Nr. | Register |
---|---|
1st | rdi |
2nd | rsi |
3rd | rdx |
4th | r10 |
5th | r8 |
6th | r9 |
Id | System call | $1 (rdi) | $2 (rsi) | $3 (rdx) | $4 (r10) | $5 (r8) | $6 (r9) |
---|---|---|---|---|---|---|---|
0 | read | unsigned int fd | char *buf | size_t count | |||
1 | write | unsigned int fd | const char *buf | size_t count | |||
60 | exit | int error_code |
- https://github.com/torvalds/linux/blob/master/arch/x86/entry/syscalls/syscall_64.tbl
- https://www.nekosecurity.com/x86-64-assembly/part-3-nasm-anatomy-syscall-passing-argument
- https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/
- “Where do you find the syscall table for Linux?” - https://unix.stackexchange.com/a/499016/39603
- some syscalls require “pointers” as args - addresses
- on a 64 bit CPU, addresses are 64 bits - 8 bytes
- instructions use square brackets to “dereference” a pointer - and take contents at the address, as opposed to the actual address itself
- so
mov rax, [foo]
can be thought of asmov rax, *foo
- the
mov
instruction is a bit misleading as it won’t move but will actually copy - the source is unchanged - how many bytes would be copied ? The first operand determines that (?: exceptions to this rule ?)
; Shows how the size of the destination register decides how many bytes mov
; copies from memory. The bytes after each instruction show rax, most
; significant byte first.
global _start
section .data
foo: db 'aaaaaaaa' ; NOTE: ASCII code for 'a' is 97, or 0x61
section .text
_start:
mov rax, foo ; 00 00 00 00 00 40 20 00 ; put the address of the first byte of foo into rax
mov rax, 0 ; 00 00 00 00 00 00 00 00 ; zero out rax
mov ah, [foo] ; 00 00 00 00 00 00 61 00 ; fill the "high" byte of ax with the first byte of foo
mov al, [foo] ; 00 00 00 00 00 00 61 61 ; fill the "low" byte of ax with the first byte of foo (note: high byte is filled from previous command)
mov ax, [foo] ; 00 00 00 00 00 00 61 61 ; ax is 2 bytes; so starting with the first byte of foo, copy 2 bytes to ax
mov eax, [foo] ; 00 00 00 00 61 61 61 61 ; eax is 4 bytes; so starting with the first byte of foo, copy 4 bytes to eax
mov rax, [foo] ; 61 61 61 61 61 61 61 61 ; rax is 8 bytes; so starting with the first byte of foo, copy 8 bytes to rax
- to load the actual address of a location into a register, use
lea
- mov
can be used instead of lea
in some cases - when the address does not require adjustments
- lea
can compute RIP-relative (position-independent) addresses, e.g. lea rax, [rel foo] - unlike mov rax, foo, which embeds the absolute address
- lea
doesn’t actually read the contents at the address - it only computes the address
- lea
can’t be used without the square brackets in the second operand
- lea
doesn’t touch the flags -
lea eax, [eax + 1]
and inc eax
achieve the same, but the former doesn’t trample flags
- simple
lea
, with 2 operands, is much faster than 3-operand lea
; Contrasts mov and lea: loading the address of a label versus loading the
; bytes stored at that address. Meant to be single-stepped in a debugger.
global _start
section .data
foo: db '123456789'
section .text
_start:
mov rax, foo ; load address of ~foo~ into rax
mov rax, [foo] ; load contents of ~foo~ (first 8 bytes) into rax
;lea rax, foo ; error: invalid combination of opcode and operands
lea rax, [foo] ; load address of ~foo~ into rax (nothing is read from memory)
lea rax, [foo + 1] ; load address of (~foo~ + 1) into rax
mov rax, [foo + 1] ; load contents of (~foo~ + 1) - the 8 bytes starting at the second byte of ~foo~ - into rax
- when size is unknown, errors result
- the way size is specified is a bit counter-intuitive; it’s specified on the destination
; Shows that a store of an immediate to memory needs an explicit size, and
; what the assembler reports when the immediate does not fit that size.
global _start
section .data
foo: dq 0xBEEFBABE
bar: dq 0xDECAFBAD
section .text
_start:
;mov [foo], bar ; error: operation size not specified
;mov [foo], 1 ; error: operation size not specified
;mov byte[foo], bar ; ⇒ relocation truncated to fit: R_X86_64_8 against `.data'
;mov [foo], [bar] ; error: invalid combination of opcode and operands
mov byte [foo], 1 ; NOTE: bytes are stored in reverse, so this will leave foo as 0xBEEFBA01 (in memory: 0x01BAEFBE)
mov byte [foo], 0x8BADF00D ; warning: byte data exceeds bounds [-w+number-overflow] (only 0D - 1 byte - will be moved)
mov word [foo], 0x8BADF00D ; warning: word data exceeds bounds [-w+number-overflow] (only F00D - 2 bytes - will be moved)
mov dword [foo], 0x8BADF00D ; fits exactly in 4 bytes - no warning
mov qword [foo], 0x8BADF00D ; warning: signed dword immediate exceeds bounds [-w+number-overflow], warning: dword data exceeds bounds [-w+number-overflow]
; Prints "hello, world!" followed by a newline, one character per write
; syscall, then exits with status 0.
global _start

section .data
message:     db 'hello, world!', 10
MESSAGE_LEN: equ $ - message        ; number of bytes in message (14), computed by the assembler

section .text
_start:
    mov ecx, 0                  ; loop index; writing ecx also zeroes the upper half of rcx
.loop:
    mov rax, 1                  ; store syscall number/id (write)
    mov rdi, 1                  ; syscall arg #1 - fd - stdout
    lea rsi, [message + rcx]    ; syscall arg #2 - address of the current character;
                                ; use the full 64-bit rcx as index - 32-bit address
                                ; arithmetic ([message + ecx]) would truncate pointers
    mov rdx, 1                  ; syscall arg #3 - number of bytes to write
    push rcx                    ; the syscall instruction overwrites rcx
    syscall
    pop rcx
    ; compare & loop
    inc ecx                     ; increment loop index
    cmp ecx, MESSAGE_LEN
    jne .loop

    mov rax, 60                 ; exit syscall
    mov rdi, 0                  ; exit code
    syscall
; Prints the value in rax as 16 hexadecimal digits on stdout, most significant
; digit first, one write syscall per digit, then exits with status 0.
global _start
section .data
hex_chars: db '0123456789ABCDEF'
section .text
_start:
mov rax, 0x1122334455667788 ; the value to print
mov rdi, 1 ; syscall arg #1 (fd), set once for all the write calls below
mov rdx, 1 ; syscall arg #3 (byte count), set once for all the write calls below
mov rcx, 64 ; cl = 0x40 = 0100 0000 = 64
.loop:
; We're going to make some changes to rax, so let's save its current value on the stack
push rax
; The rax register (like most of other ones) is 64 bits long, and each hex
; char encodes 4 bits so we process it 4 bits at a time. We keep track of
; the number of remaining bits to be processed in rcx, so it goes down by 4
; on each iteration.
; rcx: 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0100 0000 (0x40 = 64)
sub rcx, 4
; rcx: 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0011 1100 (0x3c = 60)
; Move the next 4 bits to the end of the register, by shifting right. We
; can use rcx as the number of bits to shift. On the first iteration of
; the loop, rcx (and therefore cl) will be 60; shifting right 60 bits
; leaves the first 4 bits at the end of rax. This destroys the rest of the
; bits, but that's OK because we've pushed the original value of rax to
; the stack.
; : 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8
; rax: 0001 0001 0010 0010 0011 0011 0100 0100 0101 0101 0110 0110 0111 0111 1000 1000
sar rax, cl
; rax: 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0001 (0x1)
; Get rid of all bits except the last 4. This is not useful for the first
; iteration, but in the following ones there will be more bits left after
; shifting. For example, after shifting 52 bits (3rd iteration), rax is
; 0x112 - but we only want the last 4 bits:
; rax: 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0001 0001 0010 (0x0112 = 274)
; 0xf: 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 1111 (0x000f = 15)
; rax: 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0010 (0x0002 = 2) ⇐ result after `and rax, 0xf`
; On the first iteration:
; rax: 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0001 (0x1)
; 0xf: 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 1111 (0xf = 15)
and rax, 0xf
; rax: 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0001 (0x01)
; Start preparing for syscall to print. The second argument is the address
; of what to print. We don't start with the first argument (syscall id)
; because that goes into rax, and we need rax' current value. We interpret the
; 4 bits in rax as a number, which is used to index into the `hex_chars` array.
; For example, if the 4 bits are 1010, that's 10 in decimal; the item at index
; 10 in the array is the corresponding hex char for 10, 'A' - which would get
; printed by the 'write' syscall invoked a bit later.
lea rsi, [hex_chars + rax]
; Now we can set the syscall id (1 = write)
mov rax, 1
; Syscalls can change rcx, so let's save its value
push rcx
; invoke the syscall - prints the single character that rsi points at
syscall
; restore rcx
pop rcx
; restore value of rax
pop rax
; check if we need to loop again (we have more bits to print)
cmp ecx, 0
jne .loop
; invoke exit syscall
mov rax, 60
mov rdi, 0
syscall
- the
call <addr>
instruction is equivalent topush rip; jmp <addr>
- so the current address is saved on the stack, and the CPU starts executing from <addr>
- the first 6 arguments are passed like for syscalls, except the 4th one:
- additional args can be passed on the stack
- the
ret
instruction goes back the last address stored on the stack; equivalent topop rip
- functions must take due diligence and leave the stack exactly as it was when the function started
- functions need to preserve the following registers:
rbx
,rbp
,rsp
,r12-15
- functions are free to trample the registers not enumerated above
- the above is not enforced in hardware, it is (the most common?) convention
- return value goes into
rax
Argument Nr. | Register |
---|---|
1st | rdi |
2nd | rsi |
3rd | rdx |
4th | rcx |
5th | r8 |
6th | r9 |
; Data shared by the helper functions below.
section .data
newline: db 10 ; 10 in decimal, 0A in hex - ASCII for newline char
hex_chars: db '0123456789ABCDEF' ; lookup table: the character at index n is the hex digit for n (0-15)
section .text
print_newline: ; () -> void
    ; Emit a single newline character on stdout through the write syscall.
    lea rsi, [newline]  ; buffer: the one-byte newline constant
    mov rdx, 1          ; length: one byte
    mov rdi, 1          ; fd: stdout
    mov rax, 1          ; syscall id: write
    syscall
    ret
print_hex: ; (rdi - number to print as hex) -> void
    ; Writes the 16 hex digits of rdi to stdout, most significant digit first,
    ; using one write syscall per digit.
    mov rcx, 64                 ; rcx = bits still to be printed
    mov rax, rdi                ; rax = the number; rdi is needed for the syscall
.next_digit:
    push rax                    ; keep the full number, the shift below destroys it
    sub rcx, 4                  ; position of the digit to print now
    sar rax, cl                 ; bring that digit into the low 4 bits...
    and rax, 0xF                ; ...and drop everything above it
    lea rsi, [hex_chars + rax]  ; buffer: the character for this digit
    mov rax, 1                  ; syscall id: write
    mov rdi, 1                  ; fd: stdout
    mov rdx, 1                  ; length: one byte
    push rcx                    ; the syscall instruction overwrites rcx
    syscall
    pop rcx
    pop rax                     ; the full number again
    test rcx, rcx               ; any bits left?
    jnz .next_digit
    ret
exit_ok: ; () -> does not return
    ; Terminate the process with exit status 0.
    mov rdi, 0      ; exit code
    mov rax, 60     ; syscall id: exit
    syscall
    ret             ; only reached if the exit syscall were to return
; Inline form of the exit sequence: terminate the process with status 0.
mov rax, 60 ; exit syscall
mov rdi, 0 ; exit code
syscall
<<functions>>
; Entry point: prints a 64-bit constant as hex followed by a newline, then exits.
global _start
_start:
mov rdi, 0xABCDEF0123456789 ; 1st param - the number to print
call print_hex
call print_newline ; function without args
call exit_ok ; terminates the process with status 0
- x86-64 - “little endian” ⇒ in memory, data is stored with the least significant byte first
- this does not apply to the bits inside the bytes, or to registers - just memory
<<functions>>
; Demonstrates little-endian storage: loads 8 bytes from three differently
; declared labels and prints each as hex.
global _start
section .data
foo: db 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08 ; 8 separate bytes, stored in the order written
bar: db 0x0102030405060708 ; db stores one byte, so only the low byte (08) is kept
baz: dq 0x0102030405060708 ; one 8-byte value, stored least significant byte first
section .text
_start:
mov rdi, [foo]
call print_hex ; ⇒ 0807060504030201 - reverse order, because little endian
call print_newline
mov rdi, [bar]
call print_hex ; ⇒ 0203040506070808 - only the last byte, 08, was taken because bar is 'db' - the rest are the first 7 bytes of baz, which follows bar in memory
call print_newline
mov rdi, [baz]
call print_hex ; ⇒ 0102030405060708 - all 8 bytes are stored, in the "correct" order because the whole value is stored as one
call print_newline
call exit_ok
<<functions>>
; Test data: two NUL-terminated strings.
global _start
section .data
foo: db "foobar", 0 ; 6 characters + terminating 0
bar: db "abc", 0 ; 3 characters + terminating 0
section .text
; Length of a zero-terminated string, not counting the terminator.
; $1/rdi: address of the first character
; returns/rax: number of bytes before the first 0 byte
; uses rsi as the running index and dl as scratch for the current character
str_len:
    mov rsi, 0              ; index of the character being examined
    jmp .check
.advance:
    inc rsi                 ; current character is not the terminator - count it
.check:
    mov dl, byte [rdi + rsi]
    cmp dl, 0               ; terminator reached?
    jne .advance
    mov rax, rsi            ; index of the terminator = string length
    ret
; Entry point: prints the lengths of foo and bar as hex, one per line, then exits.
_start:
; calc foo's length
mov rdi, foo ; $1 - address of the string
call str_len
mov rdi, rax ; pass the returned length on to print_hex
call print_hex
call print_newline
; calc bar's length
mov rdi, bar ; $1 - address of the string
call str_len
mov rdi, rax ; pass the returned length on to print_hex
call print_hex
call print_newline
; exit
call exit_ok
- the stack is a simple mechanism, we can
push
and pop
stuff - but there is no “allocation” per se - anything in the stack can be accessed, but we must know the address - relative to
rbp/rsp
- this is counter-intuitive, but the stack grows downwards - so when something is pushed,
rsp
decreases
- push
will place data on the stack, and decrease rsp
by 8 - even when less than 8 bytes are pushed
- AMD manual mentions different opcodes for
push
, depending on how many bytes are pushed- for one byte immediate values, it’s 6A and for other sizes (2, 4, 8) it’s 68
- I’m not sure how this matters, because even if you push 1 byte,
rsp
still goes down by 8
; Pushes immediates of different sizes. Meant to be single-stepped in a
; debugger to watch rsp: it goes down by 8 on every push, whatever the size
; of the immediate.
global _start
section .text
_start:
push 'a' ; 1 byte
push 'abc' ; 4 bytes
push 100 ; 1 byte
<<functions>>
; Test data: 64-bit unsigned values.
global _start
section .data
foo: dq 0x64 ; 100
bar: dq 0x100 ; 256
baz: dq 0x12345678 ; 305,419,896
max: dq 0xFFFFFFFF ; 4,294,967,295 - the largest value that fits in 32 bits
section .text
; Print the input buffer as a 64-bit unsigned int, in decimal, on stdout
; $1/rdi: pointer (address) to first byte
; returns: nothing
; clobbers: rax, rcx, rdx, rsi, rdi, r8, r9, r11, flags
print_uint:
    mov rcx, 10             ; divisor - we extract decimal digits
    mov rax, [rdi]          ; the value to print
    mov r8, 0               ; counts number of digits
.loop:
    ; Repeatedly divide by 10; each remainder is the next digit, least
    ; significant first. The digits are pushed, so popping them later yields
    ; the most significant digit first.
    inc r8
    mov rdx, 0              ; used by div as the high half of the dividend
    div rcx                 ; division result ⇒ rax; remainder ⇒ rdx
    add rdx, 48             ; add 48 ('0') to get the ascii code
    push rdx
    cmp rax, 0
    jne .loop
    mov rdx, 1              ; how many bytes to print - for syscall
    mov rdi, 1              ; stdout
.print_loop:
    mov rax, 1              ; syscall id (write) - must be reloaded on every
                            ; iteration, because the syscall overwrites rax
                            ; with its return value
    mov rsi, rsp            ; top of stack - ASCII code of decimal digit
    syscall
    pop r9                  ; discard the digit that was just printed
    dec r8
    cmp r8, 0
    jne .print_loop
    ret
; Entry point: prints foo, bar and baz in decimal, one per line, then exits.
_start:
; print 100
mov rdi, foo ; $1 - address of the value (print_uint takes a pointer, not the value itself)
call print_uint
call print_newline
; print 256
mov rdi, bar
call print_uint
call print_newline
; print 305419896
mov rdi, baz
call print_uint
call print_newline
; exit
call exit_ok
<<functions>>
; Test data: 64-bit signed values in two's complement.
global _start
section .data
foo: dq 0xffffffffffffffff ; -1
bar: dq 0xfffffffffffffffb ; -5
baz: dq 0x5 ; +5
section .text
; Print the input buffer as a 64-bit unsigned int, in decimal, on stdout
; $1/rdi: pointer (address) to first byte
; returns: nothing
; clobbers: rax, rcx, rdx, rsi, rdi, r8, r9, r11, flags
print_uint:
    mov rcx, 10             ; divisor - we extract decimal digits
    mov rax, [rdi]          ; the value to print
    mov r8, 0               ; counts number of digits
.loop:
    ; Repeatedly divide by 10; each remainder is the next digit, least
    ; significant first. The digits are pushed, so popping them later yields
    ; the most significant digit first.
    inc r8
    mov rdx, 0              ; used by div as the high half of the dividend
    div rcx                 ; division result ⇒ rax; remainder ⇒ rdx
    add rdx, 48             ; add 48 ('0') to get the ascii code
    push rdx
    cmp rax, 0
    jne .loop
    mov rdx, 1              ; how many bytes to print - for syscall
    mov rdi, 1              ; stdout
.print_loop:
    mov rax, 1              ; syscall id (write) - must be reloaded on every
                            ; iteration, because the syscall overwrites rax
                            ; with its return value
    mov rsi, rsp            ; top of stack - ASCII code of decimal digit
    syscall
    pop r9                  ; discard the digit that was just printed
    dec r8
    cmp r8, 0
    jne .print_loop
    ret
; Print the input buffer as a 64-bit signed int (two's complement): an explicit
; sign character ('+' or '-') followed by the magnitude in decimal
; $1/rdi: pointer (address) to first byte
print_int:
    ; r9 = magnitude, r8 = ascii code of the sign
    mov r9, [rdi]           ; the number, as stored
    mov r8, 43              ; ascii for '+' - assume it is not negative
    cmp r9, 0
    jge .print_sign         ; sign bit clear - value is already the magnitude
    mov r8, 45              ; ascii for '-' (minus)
    neg r9                  ; two's complement negation (invert all bits, add 1)
; print the sign
.print_sign:
    push r8                 ; put the sign on the stack, so there is an address to give to the 'write' syscall
    mov rsi, rsp            ; what to print - ascii of sign
    mov rdx, 1              ; how many bytes to print
    mov rdi, 1              ; stdout
    mov rax, 1              ; syscall id
    syscall
    pop r8
    ; print_uint takes a pointer, so hand it the magnitude via the stack
    push r9
    mov rdi, rsp
    call print_uint
    pop r9
    ret
; Entry point: prints foo, bar and baz as signed integers, one per line,
; then terminates through exit_ok.
_start:
        ; foo = -1
        lea     rdi, [rel foo]          ; $1 = address of the number
        call    print_int
        call    print_newline

        ; bar = -5
        lea     rdi, [rel bar]
        call    print_int
        call    print_newline

        ; baz = 5 (print_int emits an explicit '+')
        lea     rdi, [rel baz]
        call    print_int
        call    print_newline

        ; done
        call    exit_ok
- read a char and return it in rax
<<functions>>
global _start
section .data
; Prompt text: 19 bytes, no terminator. _start passes that length to write explicitly.
prompt: db 'Enter a character: '
section .text
; Entry point: shows the prompt, reads one byte from stdin into a stack
; slot, pops it into rax and terminates through exit_ok.
_start:
        ; write(stdout, prompt, 19)
        mov     eax, 1                  ; syscall id = 1 (write)
        mov     edi, 1                  ; $1, fd = 1 (stdout)
        lea     rsi, [rel prompt]       ; $2, *buf = &prompt
        mov     edx, 19                 ; $3, count = length of the prompt
        syscall

        ; read(stdin, rsp, 1)
        push    0                       ; zeroed stack slot that will receive the byte
        xor     eax, eax                ; syscall id = 0 (read)
        xor     edi, edi                ; $1, fd = 0 (stdin)
        mov     rsi, rsp                ; $2, *buf = the slot just pushed
        mov     edx, 1                  ; $3, count = 1 byte
        syscall

        pop     rax                     ; rax = the byte read (upper bytes are zero)
        call    exit_ok                 ; exit
- x64 Assembly - Constructing and using a stack string - https://www.youtube.com/watch?v=SVQjbcXXOFc
read syscall: https://linuxhint.com/read_syscall_linux/
global _start
section .bss
; 10-byte buffer that _start hands to read_word together with max_count = 10
buf resb 10
section .text
;-----------------------------------------------------------------------
; read_word - read one whitespace-delimited word from stdin.
;
; Leading whitespace (space, tab, CR) is skipped. The first word on the
; line is copied to *buf and NUL-terminated. The rest of the line, up to
; and including the LF, is consumed and discarded. A word longer than
; max_count - 1 characters is truncated to fit; truncation is NOT an error.
;
; In:    rdi = *buf      - where to place the word
;        rsi = max_count - buffer capacity, including the NUL terminator
; Out:   rax = *buf on success (the word may be empty)
;        rax = 0 if max_count is 0, if read fails or hits end of input
;              (Ctrl + D) before an LF, or if a NUL byte is read. The
;              buffer is left unterminated in that case.
; Clobb: rax, rcx, rdx, rsi, r8, r9, r10, r11, flags (rdi is preserved)
;
; Registers while running:
;        r8  = number of characters stored so far
;        r9  = maximum number of characters (max_count - 1)
;        r10 = state: 0 - no word yet; 1 - inside the word; 2 - word finished
;-----------------------------------------------------------------------
read_word: ; (rdi: *buf, rsi: max_count) -> *buf, or 0 on error
        cmp     rsi, 0
        je      .err                    ; no room even for the terminator; max_count - 1
                                        ; would wrap around and disable the bounds check
        xor     r8d, r8d                ; nothing stored yet
        lea     r9, [rsi - 1]           ; one slot is reserved for the terminating 0
        xor     r10d, r10d              ; state 0: still looking for the word

; read one byte into a zeroed stack slot, then pop it into rax
.next_char:
        push    rdi                     ; save; rdi is needed for the syscall
        push    0                       ; slot that receives the byte
        xor     eax, eax                ; syscall id = 0 (read)
        xor     edi, edi                ; $1, fd = 0 (stdin)
        mov     rsi, rsp                ; $2, *buf = the slot
        mov     edx, 1                  ; $3, count = 1
        syscall
        mov     rdx, rax                ; keep read's result: 1, 0 (EOF) or -errno
        pop     rax                     ; rax = byte read (upper bytes are zero)
        pop     rdi

        cmp     rdx, 1
        jne     .err                    ; end of input or read error
        cmp     al, 0x0a                ; LF, Enter: the line is complete
        je      .exit
        cmp     al, 0x00                ; a NUL byte would cut the string short
        je      .err

        ; whitespace ?
        cmp     al, 0x20                ; space
        je      .whitespace
        cmp     al, 0x0d                ; CR
        je      .whitespace
        cmp     al, 0x09                ; tab
        je      .whitespace

        ; not whitespace
        cmp     r10, 2
        je      .next_char              ; word already finished: discard
        cmp     r8, r9
        jae     .full                   ; no room left: truncate here
        mov     r10d, 1                 ; we are inside the word (again or for the first time)
        mov     byte [rdi + r8], al     ; copy character into the output buffer
        inc     r8
        jmp     .next_char

.full:
        mov     r10d, 2                 ; treat the word as finished and drain the line
        jmp     .next_char

.whitespace:
        cmp     r10, 1
        jne     .next_char              ; before or after the word: just skip it
        mov     r10d, 2                 ; whitespace after word characters ends the word
        jmp     .next_char

.exit:
        mov     byte [rdi + r8], 0      ; r8 <= max_count - 1, so this stays in bounds
        mov     rax, rdi
        ret

.err:
        xor     eax, eax
        ret
; Entry point: reads one word from stdin into buf with read_word and
; echoes it followed by a newline. If read_word reports an error, nothing
; is printed. The exit code is 0 in both cases.
_start:
        mov     rdi, buf                ; $1 - *buf
        mov     rsi, 10                 ; $2 - capacity of buf, terminator included
        call    read_word
        cmp     rax, 0                  ; 0 = error: just exit
        je      .exit

        ; measure the word so that only its characters are written; neither
        ; the NUL terminator nor the unused tail of buf reaches stdout
        xor     edx, edx                ; rdx = length so far
.measure:
        cmp     byte [rax + rdx], 0
        je      .print
        inc     rdx
        jmp     .measure

.print:
        mov     rsi, rax                ; source of data = start of the word
        mov     eax, 1                  ; syscall id = 1 (write)
        mov     edi, 1                  ; stdout
        syscall                         ; rdx = length computed above

        ; print newline
        push    0x0a                    ; put LF on the stack so write has an address
        mov     eax, 1                  ; syscall id = 1 (write)
        mov     edi, 1                  ; stdout
        mov     rsi, rsp                ; source of data
        mov     edx, 1                  ; one byte
        syscall
        pop     rax

.exit:
        mov     eax, 60                 ; exit syscall
        xor     edi, edi                ; exit code 0
        syscall
global _start
section .data
; NUL-terminated inputs for parse_int, each annotated with the expected outcome
a: db '291', 0 ; ok; 2 = 0x02, 29 = 0x1d, 291 = 0x123
b: db '000291', 0 ; ok
c: db 'x291', 0 ; err 1 - parsing failed
d: db '18446744073709551616', 0 ; err 2 - overflow (2^64, one past the largest 64-bit value)
e: db '18446744073709551615', 0 ; ok - 2^64 - 1, the largest value that fits
; status texts printed by the test macro ('OK' is 2 characters, 'FAILED' is 6, each followed by a 0)
ok: db 'OK', 0
failed: db 'FAILED', 0
newline: db 10 ; 10 in decimal, 0A in hex - ASCII for newline char
section .text
;-----------------------------------------------------------------------
; get_digit - convert an ASCII decimal digit to its numeric value.
;
; Experimenting with a calling convention that returns two values: the
; result in rax and an error code in r11b.
;
; In:    dil  = character code
; Out:   rax  = 0-9 (zero-extended to the full register) on success,
;               0 on failure
;        r11b = 0 - no error; 1 - not a digit char
; Clobb: rax, r11b, flags
;-----------------------------------------------------------------------
get_digit:
        cmp     dil, '0'                ; below '0' (48, 0x30) ?
        jb      .err
        cmp     dil, '9'                ; above '9' (57, 0x39) ?
        ja      .err
        movzx   eax, dil                ; write all of rax, not just al
        sub     eax, '0'                ; ASCII code -> digit value
        mov     r11b, 0
        ret
.err:
        xor     eax, eax                ; the contract promises rax = 0 on error
        mov     r11b, 1
        ret
;-----------------------------------------------------------------------
; parse_int - parse a NUL-terminated string of decimal digits as a
; 64-bit unsigned integer. Largest accepted value: 18446744073709551615.
; An empty string yields 0 with no error.
;
; In:    rdi = address of the string (preserved)
; Out:   rax = parsed number, or 0 on error
;        r11b = 0 - no error; 1 - parsing failed (non-digit character);
;               2 - overflow (number too big)
; Clobb: rax, rcx, rdx, r9, r10, r11, flags
;
; Registers while running:
;        r9  = index of the current character
;        r10 = value accumulated so far
;-----------------------------------------------------------------------
parse_int:
        xor     r9d, r9d                ; start at the first character
        xor     r10d, r10d              ; accumulated value = 0

.scan:
        movzx   edx, byte [rdi + r9]    ; current character
        cmp     dl, 0
        je      .done                   ; terminator reached

        ; convert the character; get_digit takes it in dil
        push    rdi                     ; save the string pointer
        mov     edi, edx
        call    get_digit
        pop     rdi                     ; restore
        cmp     r11b, 0
        jne     .bad_char

        movzx   ecx, al                 ; rcx = digit (mul below overwrites rax and rdx)
        mov     rax, r10
        mov     r11d, 10
        mul     r11                     ; rdx:rax = value * 10; CF set if it needs more than 64 bits
        jc      .too_big
        add     rax, rcx                ; value * 10 ends in 0; append the new digit
        jc      .too_big                ; carry out of bit 63
        mov     r10, rax
        inc     r9
        jmp     .scan

.done:
        mov     rax, r10
        xor     r11d, r11d              ; no error
        ret

.bad_char:
        xor     eax, eax
        mov     r11d, 1                 ; parsing failed
        ret

.too_big:
        xor     eax, eax
        mov     r11b, 2                 ; overflow
        ret
; Entry point: runs parse_int against the sample strings, printing 'OK' or
; 'FAILED' (each followed by a newline) for every case. The process exit
; code is the number of failed cases, so 0 means everything passed.
_start:
        xor     ebx, ebx                ; rbx = failed cases; kept in rbx because the
                                        ; syscall instruction overwrites rcx and r11

; expect_parse <string label>, <expected rax>, <expected r11b>
; (not called 'test', which would shadow the test instruction)
; Clobbers everything parse_int clobbers, plus rsi and rdi; increments rbx on failure.
%macro expect_parse 3
        mov     rdi, %1
        call    parse_int
        mov     rcx, %2                 ; via a register: cmp only takes a 32-bit immediate
        cmp     rax, rcx
        jne     %%fail
        cmp     r11b, %3
        jne     %%fail
        mov     rsi, ok                 ; source of data
        mov     edx, 2                  ; 'OK' is 2 characters
        jmp     %%report
%%fail:
        inc     rbx
        mov     rsi, failed             ; source of data
        mov     edx, 6                  ; 'FAILED' is 6 characters; the trailing 0 is not printed
%%report:
        mov     eax, 1                  ; syscall id = 1 (write)
        mov     edi, 1                  ; stdout
        syscall
        mov     eax, 1                  ; write syscall
        mov     edi, 1                  ; stdout
        lea     rsi, [newline]          ; addr of newline char
        mov     edx, 1                  ; count
        syscall
%endmacro

        ; a: db '291', 0 ; ok; 2 = 0x02, 29 = 0x1d, 291 = 0x123
        expect_parse a, 291, 0
        ; b: db '000291', 0 ; ok
        expect_parse b, 291, 0
        ; c: db 'x291', 0 ; err 1 - parsing failed
        expect_parse c, 0, 1
        ; d: db '18446744073709551616', 0 ; err 2 - overflow
        expect_parse d, 0, 2
        ; e: db '18446744073709551615', 0 ; ok
        expect_parse e, 18446744073709551615, 0

        mov     eax, 60                 ; exit syscall
        mov     rdi, rbx                ; exit code = number of failed cases
        syscall
- nice quick-start, but format is a bit weird: https://www.nasm.us/doc/nasmdoc2.html#section-2.2
- Anatomy of a system call, part 1 - https://lwn.net/Articles/604287
- https://github.com/torvalds/linux/tree/master/include/uapi/linux
- examples: https://www.csee.umbc.edu/portal/help/nasm/sample_64.shtml
- very good, but 32-bit: https://asmtutor.com/#lesson2
- good: https://cs.lmu.edu/~ray/notes/nasmtutorial/
- videos: https://www.youtube.com/playlist?list=PLetF-YjXm-sCH6FrTz4AQhfH6INDQvQSn
- instructions reference, compiled from Intel manuals: https://www.felixcloutier.com/x86/
global _start
section .data
section .text
; Minimal program: terminate immediately with status 0.
_start:
        mov     eax, 60                 ; syscall id = 60 (exit)
        xor     edi, edi                ; $1, exit code = 0
        syscall