235 changes: 235 additions & 0 deletions polly/lib/CodeGen/PerfMonitor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
//===------ PerfMonitor.cpp - Generate a run-time performance monitor. -======//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/PerfMonitor.h"
#include "polly/CodeGen/RuntimeDebugBuilder.h"
#include "llvm/ADT/Triple.h"

using namespace llvm;
using namespace polly;

/// Return the module's `atexit` function, declaring it first if necessary.
///
/// A fresh declaration is created with external linkage and the signature
/// `i32 (i8*)`.
///
/// @returns The existing or newly created `atexit` function.
Function *PerfMonitor::getAtExit() {
  const char *Name = "atexit";

  // Reuse a declaration that is already part of the module.
  if (Function *Existing = M->getFunction(Name))
    return Existing;

  FunctionType *AtExitTy = FunctionType::get(Builder.getInt32Ty(),
                                             {Builder.getInt8PtrTy()}, false);
  return Function::Create(AtExitTy, Function::ExternalLinkage, Name, M);
}

/// Append \p Fn to the module's "llvm.global_ctors" array.
///
/// An already existing "llvm.global_ctors" variable is erased and re-created
/// with its previous entries followed by a new entry for \p Fn (priority 10,
/// null data pointer).
///
/// @param Fn The function to register as a global constructor.
void PerfMonitor::addToGlobalConstructors(Function *Fn) {
  const char *Name = "llvm.global_ctors";
  std::vector<Constant *> Entries;

  // Carry over the constructors that are already registered. The array type
  // encodes its length, so the old variable is dropped and rebuilt below.
  if (GlobalVariable *OldCtors = M->getGlobalVariable(Name)) {
    Constant *OldInit = OldCtors->getInitializer();
    for (Value *Op : OldInit->operand_values())
      Entries.push_back(cast<Constant>(Op));
    OldCtors->eraseFromParent();
  }

  // Entry layout: { i32 priority, constructor pointer, i8* data }.
  StructType *EntryTy = StructType::get(Builder.getInt32Ty(), Fn->getType(),
                                        Builder.getInt8PtrTy(), nullptr);

  Entries.push_back(ConstantStruct::get(
      EntryTy, Builder.getInt32(10), Fn,
      ConstantPointerNull::get(Builder.getInt8PtrTy()), nullptr));

  ArrayType *CtorsTy = ArrayType::get(EntryTy, Entries.size());

  // The module takes ownership of the new variable on construction.
  new GlobalVariable(*M, CtorsTy, true, GlobalValue::AppendingLinkage,
                     ConstantArray::get(CtorsTy, Entries), Name, nullptr,
                     GlobalVariable::NotThreadLocal);
}

/// Return the `llvm.x86.rdtscp` function, declaring it first if necessary.
///
/// A fresh declaration is created with external linkage and the signature
/// `i64 (i8*)`.
///
/// NOTE(review): the intrinsic is declared by name with a hand-written type;
/// confirm this matches the intrinsic's signature in the LLVM version in use.
///
/// @returns The existing or newly created `llvm.x86.rdtscp` function.
Function *PerfMonitor::getRDTSCP() {
  const char *Name = "llvm.x86.rdtscp";

  // Reuse a declaration that is already part of the module.
  if (Function *Existing = M->getFunction(Name))
    return Existing;

  FunctionType *RDTSCPTy = FunctionType::get(Builder.getInt64Ty(),
                                             {Builder.getInt8PtrTy()}, false);
  return Function::Create(RDTSCPTy, Function::ExternalLinkage, Name, M);
}

/// Create a performance monitor that emits code into \p M.
///
/// Monitoring is marked as supported only when the module's target triple
/// names the x86_64 architecture.
///
/// @param M The module that receives the generated code.
PerfMonitor::PerfMonitor(Module *M) : M(M), Builder(M->getContext()) {
  const Triple TargetTriple(M->getTargetTriple());
  Supported = TargetTriple.getArch() == llvm::Triple::x86_64;
}

void PerfMonitor::addGlobalVariables() {
auto TryRegisterGlobal = [=](const char *Name, Constant *InitialValue,
Value **Location) {
*Location = M->getGlobalVariable(Name);

if (!*Location)
*Location = new GlobalVariable(
*M, InitialValue->getType(), true, GlobalValue::WeakAnyLinkage,
InitialValue, Name, nullptr, GlobalVariable::InitialExecTLSModel);
};

TryRegisterGlobal("__polly_perf_cycles_total_start", Builder.getInt64(0),
&CyclesTotalStartPtr);

TryRegisterGlobal("__polly_perf_initialized", Builder.getInt1(0),
&AlreadyInitializedPtr);

TryRegisterGlobal("__polly_perf_cycles_in_scops", Builder.getInt64(0),
&CyclesInScopsPtr);

TryRegisterGlobal("__polly_perf_cycles_in_scop_start", Builder.getInt64(0),
&CyclesInScopStartPtr);

TryRegisterGlobal("__polly_perf_write_loation", Builder.getInt32(0),
&RDTSCPWriteLocation);
}

/// Name of the generated function that initializes the performance monitor.
static const char *const InitFunctionName = "__polly_perf_init";

/// Name of the generated function that prints the final report.
static const char *const FinalReportingFunctionName = "__polly_perf_final";

/// Create the function that prints the collected performance numbers.
///
/// The function has weak_odr linkage, takes no arguments and returns void.
/// When monitoring is not supported it only prints a notice; otherwise it
/// reads the cycle counter, computes the total cycles since the recorded
/// start value and prints both the total and the cycles spent in scops.
///
/// @returns The newly created reporting function.
Function *PerfMonitor::insertFinalReporting() {
  // Set up an empty `void ()` function with a single basic block.
  FunctionType *ReportTy = FunctionType::get(Builder.getVoidTy(), {}, false);
  Function *ReportFn = Function::Create(ReportTy, Function::WeakODRLinkage,
                                        FinalReportingFunctionName, M);
  BasicBlock *Entry = BasicBlock::Create(M->getContext(), "start", ReportFn);
  Builder.SetInsertPoint(Entry);

  if (Supported) {
    // Take the final counter reading and derive the timings.
    Function *RDTSCPFn = getRDTSCP();
    Value *WriteLoc =
        Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy());
    Value *Now = Builder.CreateCall(RDTSCPFn, WriteLoc);
    Value *TotalStart = Builder.CreateLoad(CyclesTotalStartPtr, true);
    Value *TotalCycles = Builder.CreateSub(Now, TotalStart);
    Value *ScopCycles = Builder.CreateLoad(CyclesInScopsPtr, true);

    // Emit the report itself.
    RuntimeDebugBuilder::createCPUPrinter(Builder,
                                          "Polly runtime information\n");
    RuntimeDebugBuilder::createCPUPrinter(Builder,
                                          "-------------------------\n");
    RuntimeDebugBuilder::createCPUPrinter(Builder, "Total: ", TotalCycles,
                                          "\n");
    RuntimeDebugBuilder::createCPUPrinter(Builder, "Scops: ", ScopCycles,
                                          "\n");
  } else {
    // No cycle counter available: just tell the user.
    RuntimeDebugBuilder::createCPUPrinter(
        Builder, "Polly runtime information generation not supported\n");
  }

  // Both variants end with a plain return.
  Builder.CreateRetVoid();
  return ReportFn;
}

/// Set up the performance monitor for the module.
///
/// The monitor globals are always looked up or created. The reporting and
/// initialization functions are only generated (and the initializer only
/// registered as a global constructor) if the module does not yet contain a
/// function named like the initializer.
void PerfMonitor::initialize() {
  addGlobalVariables();

  // An existing initializer means the functions were already generated.
  if (M->getFunction(InitFunctionName))
    return;

  Function *FinalReporting = insertFinalReporting();
  addToGlobalConstructors(insertInitFunction(FinalReporting));
}

/// Create the function that initializes the performance monitor at startup.
///
/// The generated function returns immediately if the "already initialized"
/// flag is set. Otherwise it sets the flag, registers \p FinalReporting with
/// atexit() and, if monitoring is supported, records the current cycle
/// counter as the start of the total measurement.
///
/// @param FinalReporting The function to register with atexit().
/// @returns The newly created initialization function.
Function *PerfMonitor::insertInitFunction(Function *FinalReporting) {
  // Create the `void ()` function and its three basic blocks.
  auto &Ctx = M->getContext();
  FunctionType *InitTy = FunctionType::get(Builder.getVoidTy(), {}, false);
  Function *InitFn =
      Function::Create(InitTy, Function::WeakODRLinkage, InitFunctionName, M);
  BasicBlock *EntryBB = BasicBlock::Create(Ctx, "start", InitFn);
  BasicBlock *EarlyReturnBB = BasicBlock::Create(Ctx, "earlyreturn", InitFn);
  BasicBlock *DoInitBB = BasicBlock::Create(Ctx, "initbb", InitFn);

  // Entry: bail out if the initializer has already run.
  //
  // When profiling is enabled in several translation units, each of them adds
  // the initializer to its own global constructor list. Merging translation
  // units simply appends these lists, so the initializer can show up several
  // times. To keep the initialization from running repeatedly (and especially
  // to keep the reporting function from being registered with atexit() more
  // than once), every run after the first returns early.
  Builder.SetInsertPoint(EntryBB);
  Value *AlreadyRan = Builder.CreateLoad(AlreadyInitializedPtr);
  Builder.CreateCondBr(AlreadyRan, EarlyReturnBB, DoInitBB);

  Builder.SetInsertPoint(EarlyReturnBB);
  Builder.CreateRetVoid();

  // Actual initialization: first remember that it happened.
  Builder.SetInsertPoint(DoInitBB);
  Builder.CreateStore(Builder.getInt1(true), AlreadyInitializedPtr);

  // Hand the final reporting function to atexit().
  Value *ReportFnAsI8Ptr =
      Builder.CreatePointerCast(FinalReporting, Builder.getInt8PtrTy());
  Function *AtExitFn = getAtExit();
  Builder.CreateCall(AtExitFn, {ReportFnAsI8Ptr});

  if (Supported) {
    // Read the current cycle counter and keep it as the overall start value.
    Function *RDTSCPFn = getRDTSCP();
    Value *WriteLoc =
        Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy());
    Value *Now = Builder.CreateCall(RDTSCPFn, WriteLoc);
    Builder.CreateStore(Now, CyclesTotalStartPtr, true);
  }
  Builder.CreateRetVoid();

  return InitFn;
}

/// Emit code that records the cycle counter at the start of a region.
///
/// Nothing is emitted when monitoring is not supported. Otherwise the counter
/// is read and stored (volatile) as the start value of the current scop.
///
/// @param InsertBefore The instruction in front of which the code is placed.
void PerfMonitor::insertRegionStart(Instruction *InsertBefore) {
  if (!Supported)
    return;

  Builder.SetInsertPoint(InsertBefore);

  Function *RDTSCPFn = getRDTSCP();
  Value *WriteLoc =
      Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy());
  Value *Now = Builder.CreateCall(RDTSCPFn, WriteLoc);
  Builder.CreateStore(Now, CyclesInScopStartPtr, true);
}

/// Emit code that accounts the cycles spent in a region at its end.
///
/// Nothing is emitted when monitoring is not supported. Otherwise the start
/// value of the current scop is loaded, the counter is read again, and the
/// difference is added to the accumulated cycles-in-scops counter. All
/// accesses to the monitor globals emitted here are volatile.
///
/// @param InsertBefore The instruction in front of which the code is placed.
void PerfMonitor::insertRegionEnd(Instruction *InsertBefore) {
  if (!Supported)
    return;

  Builder.SetInsertPoint(InsertBefore);

  Function *RDTSCPFn = getRDTSCP();

  // Cycles elapsed in this region = current reading - recorded start.
  Value *RegionStart = Builder.CreateLoad(CyclesInScopStartPtr, true);
  Value *WriteLoc =
      Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy());
  Value *Now = Builder.CreateCall(RDTSCPFn, WriteLoc);
  Value *Elapsed = Builder.CreateSub(Now, RegionStart);

  // Fold the elapsed cycles into the running total.
  Value *Accumulated = Builder.CreateLoad(CyclesInScopsPtr, true);
  Value *Updated = Builder.CreateAdd(Accumulated, Elapsed);
  Builder.CreateStore(Updated, CyclesInScopsPtr, true);
}
87 changes: 87 additions & 0 deletions polly/test/Isl/CodeGen/perf_monitoring.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \
; RUN: -S < %s | FileCheck %s

; void f(long A[], long N) {
; long i;
; if (true)
; for (i = 0; i < N; ++i)
; A[i] = i;
; }

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"

; Test kernel: one counted loop that stores the induction variable into
; A[i], matching the C source shown above.
; NOTE(review): the fences in %entry and %return presumably keep those blocks
; outside the region Polly transforms -- confirm.
define void @f(i64* %A, i64 %N) nounwind {
entry:
fence seq_cst
br label %next

next:
; Constant-true condition, mirroring the `if (true)` of the C source.
br i1 true, label %for.i, label %return

for.i:
; %indvar starts at 0 and is stored to A[%indvar] on every iteration.
%indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ]
%scevgep = getelementptr i64, i64* %A, i64 %indvar
store i64 %indvar, i64* %scevgep
%indvar.next = add nsw i64 %indvar, 1
; Leave the loop once the incremented counter equals %N.
%exitcond = icmp eq i64 %indvar.next, %N
br i1 %exitcond, label %return, label %for.i

return:
fence seq_cst
ret void
}

; CHECK: @__polly_perf_cycles_total_start = weak thread_local(initialexec) constant i64 0
; CHECK-NEXT: @__polly_perf_initialized = weak thread_local(initialexec) constant i1 false
; CHECK-NEXT: @__polly_perf_cycles_in_scops = weak thread_local(initialexec) constant i64 0
; CHECK-NEXT: @__polly_perf_cycles_in_scop_start = weak thread_local(initialexec) constant i64 0
; CHECK-NEXT: @__polly_perf_write_loation = weak thread_local(initialexec) constant i32 0

; CHECK: polly.split_new_and_old: ; preds = %entry
; CHECK-NEXT: %0 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
; CHECK-NEXT: store volatile i64 %0, i64* @__polly_perf_cycles_in_scop_start

; CHECK: polly.merge_new_and_old: ; preds = %polly.exiting, %return.region_exiting
; CHECK-NEXT: %5 = load volatile i64, i64* @__polly_perf_cycles_in_scop_start
; CHECK-NEXT: %6 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
; CHECK-NEXT: %7 = sub i64 %6, %5
; CHECK-NEXT: %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops
; CHECK-NEXT: %9 = add i64 %8, %7
; CHECK-NEXT: store volatile i64 %9, i64* @__polly_perf_cycles_in_scops
; CHECK-NEXT: br label %return


; CHECK: define weak_odr void @__polly_perf_final() {
; CHECK-NEXT: start:
; CHECK-NEXT: %0 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
; CHECK-NEXT: %1 = load volatile i64, i64* @__polly_perf_cycles_total_start
; CHECK-NEXT: %2 = sub i64 %0, %1
; CHECK-NEXT: %3 = load volatile i64, i64* @__polly_perf_cycles_in_scops
; CHECK-NEXT: %4 = call i32 (...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @1, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([27 x i8], [27 x i8] addrspace(4)* @0, i32 0, i32 0))
; CHECK-NEXT: %5 = call i32 @fflush(i8* null)
; CHECK-NEXT: %6 = call i32 (...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @3, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([27 x i8], [27 x i8] addrspace(4)* @2, i32 0, i32 0))
; CHECK-NEXT: %7 = call i32 @fflush(i8* null)
; CHECK-NEXT: %8 = call i32 (...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @6, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @4, i32 0, i32 0), i64 %2, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @5, i32 0, i32 0))
; CHECK-NEXT: %9 = call i32 @fflush(i8* null)
; CHECK-NEXT: %10 = call i32 (...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @9, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @7, i32 0, i32 0), i64 %3, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @8, i32 0, i32 0))
; CHECK-NEXT: %11 = call i32 @fflush(i8* null)
; CHECK-NEXT: ret void
; CHECK-NEXT: }


; CHECK: define weak_odr void @__polly_perf_init() {
; CHECK-NEXT: start:
; CHECK-NEXT: %0 = load i1, i1* @__polly_perf_initialized
; CHECK-NEXT: br i1 %0, label %earlyreturn, label %initbb

; CHECK: earlyreturn: ; preds = %start
; CHECK-NEXT: ret void

; CHECK: initbb: ; preds = %start
; CHECK-NEXT: store i1 true, i1* @__polly_perf_initialized
; CHECK-NEXT: %1 = call i32 @atexit(i8* bitcast (void ()* @__polly_perf_final to i8*))
; CHECK-NEXT: %2 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
; CHECK-NEXT: store volatile i64 %2, i64* @__polly_perf_cycles_total_start
; CHECK-NEXT: ret void
; CHECK-NEXT: }