Add support for quick heap check failures.

Whenever we fill up an allocation buffer we get a heap check
failure.  So far this meant falling back to the interpreter, executing
one iteration there, and then re-entering the trace.  If a GC is
actually necessary this approach is fine.  Most of the time, however,
we just need to grab a new buffer and update the heap pointer and the
heap limit, nothing else.  Falling back to the interpreter for this
very frequent case causes a fair amount of overhead.

This patch adds a special handler that re-enters the trace directly
if we only needed to grab a new buffer, and exits to the interpreter
only if a GC is required.
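
In C-level pseudocode the new exit stub behaves roughly like this.
This is a sketch only: saveAllRegisters, restoreRegisters and
resumeTrace are invented helper names, and the real version
(asmHeapOverflow in vm/amd64/fragment.cc below) is hand-written
assembly:

    // Called from trace code when Hp > HpLim after a bump allocation.
    // exitno identifies the snapshot, bytes the size of the failed
    // allocation (both are loaded by a small per-check stub).
    void asmHeapOverflow(ExitNo exitno, Word bytes) {
      ExitState s = saveAllRegisters();  // hypothetical helper
      s.gpr[RID_HP] -= bytes;            // undo the speculative Hp bump
      if (heapCheckFail(&s) == 0) {      // fresh buffer grabbed, no GC
        restoreRegisters(&s);            // hypothetical helper
        resumeTrace();                   // re-runs the bump and the check
      } else {
        s.gpr[RID_HP] += bytes;          // the snapshot expects the bump
        exitTrace(exitno, &s);           // slow path: GC via interpreter
      }
    }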

Benchmarks (elapsed MUT time):

             | ghc-7.0.3 -O2 | lcvm (old) | lcvm (new)
  -----------+---------------+------------+-----------
  SumFromTo1 |     2.24s     |   2.42s    |   2.08s
  SumSquare1 |     2.50s     |   2.64s    |   2.08s

Yes, it's odd that lcvm now takes exactly the same time on both
benchmarks, but they generate different numbers of traces, so I'm not
accidentally running the wrong code.
Commit cfa83d78c81005cc9d675d87d48489f1886b6fc1 (parent 1b111c9), committed by @nominolo on Sep 23, 2012
Showing with 298 additions and 9 deletions.
  1. +149 −4 vm/amd64/fragment.cc
  2. +73 −3 vm/assembler.cc
  3. +7 −0 vm/assembler.hh
  4. +8 −0 vm/capability.hh
  5. +4 −1 vm/ir.cc
  6. +34 −1 vm/ir.hh
  7. +1 −0 vm/jit.hh
  8. +2 −0 vm/main.cc
  9. +16 −0 vm/memorymanager.cc
  10. +4 −0 vm/memorymanager.hh
vm/amd64/fragment.cc
@@ -2,6 +2,7 @@
#include "jit.hh"
#include "thread.hh"
#include "assembler.hh"
+#include "capability.hh"
#include <iostream>
@@ -20,9 +21,29 @@ exitTrace(ExitNo n, ExitState *s) {
}
}
+extern "C" int LC_USED
+heapCheckFail(ExitState *s)
+{
+ Capability *cap = s->T->owner();
+ Word *hpold = (Word*)s->gpr[RID_HP];
+ Word *hplim = s->hplim;
+ int ok = cap->heapCheckFailQuick((char **)&s->gpr[RID_HP], (char **)&s->hplim);
+#if (DEBUG_COMPONENTS & DEBUG_TRACE_ENTEREXIT)
+ cerr << "HeapCheckFail ";
+ if (ok == 0) {
+ cerr << "QUICK hp/lim = " << hpold << '/' << hplim << " => "
+ << (Word*)s->gpr[RID_HP] << '/' << s->hplim << endl;
+ } else {
+ cerr << "SLOW (GC necessary)" << endl;
+ }
+#endif
+ return ok;
+}
+
#define ASM_ENTER NAME_PREFIX "asmEnter"
#define ASM_EXIT NAME_PREFIX "asmExit"
#define ASM_TRACE NAME_PREFIX "asmTrace"
+#define ASM_HEAP_OVERFLOW NAME_PREFIX "asmHeapOverflow"
#define SAVE_SIZE (80 + 256 * sizeof(Word))
@@ -64,7 +85,7 @@ Finding such bugs must be extremely tricky (although Valgrind may
help.) I found it because I read an [article on the x86-64 ABI][1].
[1]: http://eli.thegreenplace.net/2011/09/06/stack-frame-layout-on-x86-64/
-
+
*/
static void LC_USED
@@ -73,7 +94,7 @@ asmEnterIsImplementedInAssembly(TraceId F_id, Thread *T,
Word *stacklim, MCode *code) {
/* rdi = F_id, rsi = T, rdx = hp, rcx = hplim,
r8 = stacklim, r9 = code */
-
+
asm volatile(
".globl " ASM_ENTER "\n"
ASM_ENTER ":\n\t"
@@ -261,13 +282,13 @@ asmTraceIsImplementedInAssembly(void) {
"call " NAME_PREFIX "debugTrace\n\t"
"addq $128, %%rsp\n\t" /* deallocate space for xmm registers */
-
+
  /* r12-r15 are guaranteed to be the same. r14 contains the return
address. */
"movq 0(%%rsp), %%rax\n\t"
"movq 8(%%rsp), %%rcx\n\t"
"movq 16(%%rsp), %%rdx\n\t"
- /* "movq 24(%rsp), %%rbx\n\t" // callee-save */
+ /* "movq 24(%rsp), %%rbx\n\t" // callee-save */
/* "movq 32(%rsp), %%rsp\n\t" */
"movq 40(%%rsp), %%rbp\n\t"
"movq 48(%%rsp), %%rsi\n\t"
@@ -288,4 +309,128 @@ asmTraceIsImplementedInAssembly(void) {
: : );
}
+static void LC_USED
+asmHeapBufOverflowDummy(void) {
+ asm volatile(
+ ".globl " ASM_HEAP_OVERFLOW "\n"
+ ASM_HEAP_OVERFLOW ":\n\t"
+
+ /* We're calling into C code, so let's save callee-save registers
+ first. Depending on the result of the C function we then have to
+ save all the ExitState or we just return.
+ */
+
+ /* Frame:
+
+ +----------------+
+ rsp + 16 | saved rdi | becomes: r15
+ +----------------+
+ rsp + 8 | saved rsi | r14
+ +----------------+
+ rsp + 0 | return address | r13
+ +----------------+
+
+ */
+ "subq %%rsi, %%r12\n\t"
+ "pushq %%r12\n\t"
+ "pushq %%r11\n\t"
+ "pushq %%r10\n\t"
+ "pushq %%r9\n\t"
+ "pushq %%r8\n\t"
+ "pushq %%rdi\n\t" // the exit number
+ "pushq %%rsi\n\t" // bytes
+ "pushq %%rbp\n\t"
+
+ "lea 88(%%rsp), %%rbp\n\t"
+ "pushq %%rbp\n\t"
+
+ "pushq %%rbx\n\t"
+ "pushq %%rdx\n\t"
+ "pushq %%rcx\n\t"
+ "pushq %%rax\n\t"
+
+ /* make room for XMM registers */
+ "sub $128, %%rsp\n\t" /* make room for them on the stack */
+ "sub $128, %%rbp\n\t" /* skip past the int regs */
+
+ /* TODO: Are XMM registers caller-save? */
+
+ /* call heap overflow handler: rdi = ExitState* */
+ "movq %%rsp, %%rdi\n\t"
+ "call " NAME_PREFIX "heapCheckFail\n\t"
+
+ "test %%eax, %%eax\n\t"
+ "jnz .L1\n\t"
+
+ /* Common case: we jump back to the trace code. */
+ "addq $128, %%rsp\n\t" /* deallocate XMM registers */
+
+ "movq 0(%%rsp), %%rax\n\t"
+ "movq 8(%%rsp), %%rcx\n\t"
+ "movq 16(%%rsp), %%rdx\n\t"
+ "movq 24(%%rsp), %%rbx\n\t"
+ // rsp
+ "movq 40(%%rsp), %%rbp\n\t"
+ "movq 48(%%rsp), %%rsi\n\t"
+ // Don't need to restore %rdi
+ "movq 64(%%rsp), %%r8\n\t"
+ "movq 72(%%rsp), %%r9\n\t"
+ "movq 80(%%rsp), %%r10\n\t"
+ "movq 88(%%rsp), %%r11\n\t"
+ "movq 96(%%rsp), %%r12\n\t" // new Heap pointer
+ "addq $104, %%rsp\n\t" // Top of stack = return address
+ "ret\n" // Jump back to trace.
+
+ ".L1:\n\t"
+ /* Uncommon case: we need to do GC and fall back to the
+ interpreter. */
+
+ /* TODO: Save XMM registers. */
+
+ /* Save r13-r15 and get exit number. */
+ /* [rsp + 0] = xmm15; [rsp + 32 * 8] = trace rsp;
+ [rsp + 31 * 8] = saved rdi = rsp + 248
+ [rsp + 30 * 8] = saved rsi = rsp + 240
+ [rsp + 29 * 8] = return address = rsp + 232
+
+ [rsp + (16 + 7) * 8] = exit no = rsp + 184
+ [rsp + (16 + 6) * 8] = bytes = rsp + 176
+ */
+
+ "movl 184(%%rsp), %%edi\n\t" // exit no.
+ "movq 248(%%rsp), %%rax\n\t" // saved rdi
+ "movq %%rax, 184(%%rsp)\n\t" // expected saved location for rdi
+
+ "movl 176(%%rsp), %%esi\n\t" // bytes
+ "addq %%rsi, 224(%%rsp)\n\t" // re-increment r12 (yes, messy)
+ "movq 240(%%rsp), %%rax\n\t" // saved rsi
+ "movq %%rax, 176(%%rsp)\n\t" // expected save location for rsi
+
+ "mov %%r15, 248(%%rsp)\n\t"
+ "mov %%r14, 240(%%rsp)\n\t"
+ "mov %%r13, 232(%%rsp)\n\t"
+ "mov %%rsp, %%rsi\n\t"
+
+ "call " NAME_PREFIX "exitTrace\n\t"
+
+ /* We can't increment the stack pointer just yet. */
+ "leaq %c0(%%rsp), %%rax\n\t"
+
+ /* restore callee saved registers */
+ "movq -8(%%rax),%%rbx\n\t"
+ "movq -16(%%rax),%%r12\n\t"
+ "movq -24(%%rax),%%r13\n\t"
+ "movq -32(%%rax),%%r14\n\t"
+ "movq -40(%%rax),%%r15\n\t"
+
+ /* deallocate the stack used */
+ "addq %0, %%rsp\n\t"
+
+ /* return to original caller */
+ "pop %%rbp\n\t"
+ "ret\n\t"
+
+ : : "i"(SAVE_SIZE + 256));
+}
+
_END_LAMBDACHINE_NAMESPACE
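
Note that the fast path above passes %rsp directly as the ExitState
pointer, so the stub's stack frame must mirror the ExitState layout:
16 XMM slots followed by 16 GPR slots in x86-64 register-ID order
(rax = 0, rcx = 1, ..., r15 = 15).  A sketch of the assumed layout;
the actual definition lives elsewhere in the VM sources, and the
placement of hplim and T after the gpr array is a guess here:

    struct ExitState {     // layout inferred from the stub's offsets
      double  fpr[16];     // xmm0..xmm15, the 128-byte block at rsp+0
      Word    gpr[16];     // by register number; gpr[12] = r12 = Hp
      Word   *hplim;       // read and updated by heapCheckFail
      Thread *T;           // s->T->owner() locates the Capability
      // ... further fields not used by the quick heap check
    };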
vm/assembler.cc
@@ -97,7 +97,9 @@ void Assembler::setupRegAlloc() {
// the prev() fields of the input instructions are valid.
// This won't be the case after this function finishes.
void Assembler::setup(IRBuffer *buf) {
- buf->setHeapOffsets();
+ numHeapChecks_ = buf->setHeapOffsets();
+ mcQuickHeapCheck_ = NULL;
+
setupRegAlloc();
ir_ = buf->buffer_; // REF-biased
@@ -762,6 +764,46 @@ bool Assembler::mergeWithParent() {
return true;
}
+#define QUICK_HEAP_CHECK_FAIL_SIZE 24
+
+void
+Assembler::heapCheckFailure(SnapNo snapno, MCode *retryAddress, MCode *p, int32_t bytes)
+{
+ // We generate the following code (24 bytes):
+ //
+ // 0: 57 push %rdi
+ // 1: 56 push %rsi
+ // 2: bf <exitno> mov $<exitno>,%edi
+ // 7: be <bytes> mov $<bytes>,%esi
+ // 12: e8 00 00 00 00 callq asmHeapOverflow
+ // 17: 5e pop %rsi
+ // 18: 5f pop %rdi
+ // 19: e9 00 00 00 00 jmpq <retaddr>
+
+ *p++ = XI_PUSH + RID_EDI;
+ *p++ = XI_PUSH + RID_ESI;
+
+ *p++ = XI_MOVri + RID_EDI;
+ *(int32_t *)p = (int32_t)snapno;
+ p += 4;
+
+ *p++ = XI_MOVri + RID_ESI;
+ *(int32_t *)p = bytes;
+ p += 4;
+
+ *p++ = XI_CALL;
+ p += 4;
+ *(int32_t *)(p - 4) = jmprel(p, (MCode *)(void *)asmHeapOverflow);
+
+ *p++ = XI_POP + RID_ESI;
+ *p++ = XI_POP + RID_EDI;
+
+ *p++ = XI_JMP;
+ p += 4;
+ *(int32_t *)(p - 4) = jmprel(p, retryAddress);
+}
+
+
// Adjust the Trace ID to the new target:
//
// movl <TraceId>, <F_ID_OFFS>(%rsp)
@@ -786,6 +828,12 @@ MCode *emitSetTraceId(MCode *p, TraceId traceId) {
}
void Assembler::prepareTail(IRBuffer *buf, IRRef saveref) {
+ if (jit()->getOption(Jit::kOptFastHeapCheckFail) &&
+ numHeapChecks_ > 0) {
+ mctop -= QUICK_HEAP_CHECK_FAIL_SIZE;
+ mcQuickHeapCheck_ = mctop;
+ }
+
MCode *p = mctop;
if (saveref && ir(saveref)->op1()) {
// The SAVE instruction loops back somewhere. Reserve space for
@@ -1020,13 +1068,35 @@ void Assembler::fieldLoad(IR *ins) {
void Assembler::heapCheck(IR *ins) {
int32_t bytes = sizeof(Word) * ins->op1();
- if (bytes != 0) {
+
+ if (bytes == 0)
+ return;
+
+ if (mcQuickHeapCheck_ != NULL) {
+
+ MCode *p = mcp;
+ Snapshot &snap = buf_->snap(snapno_);
+ *(int32_t *)(p - 4) = jmprel(p, mcQuickHeapCheck_);
+ p[-5] = (MCode)(XI_JCCn + (CC_A & 15));
+ p[-6] = 0x0f;
+ mcp = p - 6;
+ snap.mcode_ = NULL; // just in case
+
+ emit_rmro(XO_CMP, RID_HP | REX_64, RID_ESP | REX_64, HPLIM_SP_OFFS);
+ emit_rmro(XO_LEA, RID_HP | REX_64, RID_HP | REX_64, bytes);
+
+ MCode *retryAddr = mcp;
+ heapCheckFailure(snapno_, retryAddr, mcQuickHeapCheck_, bytes);
+ mcQuickHeapCheck_ = NULL;
+
+ } else {
+
guardcc(CC_A);
// HpLim == [rsp + HPLIM_SP_OFFS]
emit_rmro(XO_CMP, RID_HP | REX_64, RID_ESP | REX_64, HPLIM_SP_OFFS);
- // TODO: Emit guard
+ // Hp = Hp + bytes
emit_rmro(XO_LEA, RID_HP | REX_64, RID_HP | REX_64, bytes);
}
}
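
Putting the two halves together, the code that ends up in the trace
looks schematically like this in execution order (the assembler emits
it back to front).  This assumes RID_HP maps to r12, as the exit
stub's use of r12 for the heap pointer suggests:

      lea  <bytes>(%r12), %r12        # Hp += bytes
      cmp  HPLIM_SP_OFFS(%rsp), %r12  # compare Hp against HpLim
      ja   quick                      # taken only on buffer overflow
      ...                             # rest of the trace

    quick:                            # the 24-byte stub at mctop
      push %rdi
      push %rsi
      mov  $<exitno>, %edi
      mov  $<bytes>, %esi
      call asmHeapOverflow            # returns only on the quick path
      pop  %rsi
      pop  %rdi
      jmp  <retry>                    # back to the lea: re-bump, re-check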
vm/assembler.hh
@@ -249,6 +249,7 @@ typedef enum {
XI_JMP = 0xe9,
XI_JMPs = 0xeb,
XI_PUSH = 0x50, /* Really 50+r. */
+ XI_POP = 0x58, /* Really 58+r. */
XI_JCCs = 0x70, /* Really 7x. */
XI_JCCn = 0x80, /* Really 0f8x. */
XI_LEA = 0x8d,
@@ -539,6 +540,9 @@ public:
// Emits code to increment the value of the value at the target
// address.
void incrementCounter(uint64_t *counterAddr);
+
+ void heapCheckFailure(SnapNo snapno, MCode *retaddr, MCode *p, int32_t bytes);
+
void assemble(IRBuffer *, MachineCode *);
void transfer(RegSpill dst, RegSpill src, ParAssign *assign);
@@ -644,6 +648,9 @@ private:
MCode *mcbot; // Bottom of reserved MCode
MCode *mctop; // Top of generated MCode
+ MCode *mcQuickHeapCheck_;
+ uint32_t numHeapChecks_;
+
Jit *jit_;
IR *ir_;
IRBuffer *buf_;
vm/capability.hh
@@ -46,6 +46,8 @@ public:
// take effect.
void setState(int state);
+ inline int heapCheckFailQuick(char **heap, char **hplim);
+
private:
typedef enum {
kModeInit,
@@ -88,6 +90,12 @@ private:
friend class Fragment;
};
+inline int
+Capability::heapCheckFailQuick(char **heap, char **hplim)
+{
+ return mm_->bumpAllocatorFullNoGC(heap, hplim);
+}
+
_END_LAMBDACHINE_NAMESPACE
#endif /* _CAPABILITY_H_ */
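
The matching memorymanager.cc change (+16 lines) is not part of the
excerpt above.  From the commit message, bumpAllocatorFullNoGC hands
out a fresh allocation buffer when that is possible without a
collection.  A sketch under that assumption; Block, nextEmptyBlock and
gcRequired are invented names:

    int MemoryManager::bumpAllocatorFullNoGC(char **heap, char **heaplim) {
      if (gcRequired())               // hypothetical: block budget used up
        return 1;                     // caller must exit the trace and GC
      Block *b = nextEmptyBlock();    // hypothetical: grab a fresh buffer
      *heap    = b->start;            // new heap pointer
      *heaplim = b->end;              // new heap limit
      return 0;                       // quick path: re-enter the trace
    }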
vm/ir.cc
@@ -385,16 +385,18 @@ TRef IRBuffer::emitNEW(IRRef1 itblref, int nfields, HeapEntry *out) {
return tref;
}
-void IRBuffer::setHeapOffsets() {
+uint32_t IRBuffer::setHeapOffsets() {
int offset = 0;
IRRef href = chain_[IR::kNEW];
IRRef chkref = chain_[IR::kHEAPCHK];
IRRef cur = href;
+ uint32_t heapchecks = 0;
for (;;) {
while (chkref > cur) {
// Check that we've reserved exactly the amount needed.
LC_ASSERT((int)ir(chkref)->op1() == -offset);
offset = 0;
+ ++heapchecks;
chkref = ir(chkref)->prev();
}
if (!cur)
@@ -407,6 +409,7 @@ void IRBuffer::setHeapOffsets() {
}
// Non-zero offset indicates missing heap check.
LC_ASSERT(offset == 0);
+ return heapchecks;
}
AbstractStack::AbstractStack()