/*
Copyright (c) 2009-2019, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// written by Roman Dementiev
// Otto Bruggeman
// Thomas Willhalm
// Pat Fay
// Austen Ott
// Jim Harris (FreeBSD)
/*! \file cpucounters.cpp
\brief The bulk of PCM implementation
*/
//#define PCM_TEST_FALLBACK_TO_ATOM
#include <stdio.h>
#include <assert.h>
#ifdef PCM_EXPORTS
// pcm-lib.h includes cpucounters.h
#include "PCM-Lib_Win\pcm-lib.h"
#else
#include "cpucounters.h"
#endif
#include "msr.h"
#include "pci.h"
#include "types.h"
#include "utils.h"
#if defined (__FreeBSD__) || defined(__DragonFly__)
#include <sys/param.h>
#include <sys/module.h>
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/sem.h>
#include <sys/ioccom.h>
#include <sys/cpuctl.h>
#include <machine/cpufunc.h>
#endif
#ifdef _MSC_VER
#include <intrin.h>
#include <windows.h>
#include <comdef.h>
#include <tchar.h>
#include "winring0/OlsApiInit.h"
#include "PCM_Win/windriver.h"
#else
#include <pthread.h>
#if defined(__FreeBSD__) || (defined(__DragonFly__) && __DragonFly_version >= 400707)
#include <pthread_np.h>
#endif
#include <errno.h>
#include <sys/time.h>
#ifdef __linux__
#include <sys/mman.h>
#endif
#endif
#include <string.h>
#include <limits>
#include <map>
#include <algorithm>
#include <thread>
#include <future>
#include <functional>
#include <queue>
#include <condition_variable>
#include <mutex>
#include <atomic>
#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/sem.h>
// convertUnknownToInt is used in the safe sysctl call to convert an unknown size to an int
int convertUnknownToInt(size_t size, char* value);
#endif
#undef PCM_UNCORE_PMON_BOX_CHECK_STATUS // debug only
#undef PCM_DEBUG_TOPOLOGY // debug of topology enumeration routine
// FreeBSD is much more restrictive about names for semaphores
#if defined (__FreeBSD__)
#define PCM_INSTANCE_LOCK_SEMAPHORE_NAME "/PCM_inst_lock"
#define PCM_NUM_INSTANCES_SEMAPHORE_NAME "/num_PCM_inst"
#else
#define PCM_INSTANCE_LOCK_SEMAPHORE_NAME "PCM inst lock"
#define PCM_NUM_INSTANCES_SEMAPHORE_NAME "Num PCM insts"
#endif
#ifdef _MSC_VER
HMODULE hOpenLibSys = NULL;
bool PCM::initWinRing0Lib()
{
const BOOL result = InitOpenLibSys(&hOpenLibSys);
if (result == FALSE)
{
hOpenLibSys = NULL;
return false;
}
BYTE major, minor, revision, release;
GetDriverVersion(&major, &minor, &revision, &release);
wchar_t buffer[128];
swprintf_s(buffer, 128, _T("\\\\.\\WinRing0_%d_%d_%d"),(int)major,(int)minor, (int)revision);
restrictDriverAccess(buffer);
return true;
}
class InstanceLock
{
HANDLE Mutex;
InstanceLock();
public:
InstanceLock(const bool global)
{
Mutex = CreateMutex(NULL, FALSE,
global?(L"Global\\Processor Counter Monitor instance create/destroy lock"):(L"Local\\Processor Counter Monitor instance create/destroy lock"));
// lock
WaitForSingleObject(Mutex, INFINITE);
}
~InstanceLock()
{
// unlock
ReleaseMutex(Mutex);
CloseHandle(Mutex);
}
};
#else // Linux or Apple
pthread_mutex_t processInstanceMutex = PTHREAD_MUTEX_INITIALIZER;
class InstanceLock
{
const char * globalSemaphoreName;
sem_t * globalSemaphore;
bool global;
InstanceLock();
public:
InstanceLock(const bool global_) : globalSemaphoreName(PCM_INSTANCE_LOCK_SEMAPHORE_NAME), globalSemaphore(NULL), global(global_)
{
if(!global)
{
pthread_mutex_lock(&processInstanceMutex);
return;
}
umask(0);
while (1)
{
//sem_unlink(globalSemaphoreName); // temporary
globalSemaphore = sem_open(globalSemaphoreName, O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO, 1);
if (SEM_FAILED == globalSemaphore)
{
if (EACCES == errno)
{
std::cerr << "PCM Error, do not have permissions to open semaphores in /dev/shm/. Waiting one second and retrying..." << std::endl;
sleep(1);
}
}
else
{
/*
if (sem_post(globalSemaphore)) {
perror("sem_post error");
}
*/
break; // success
}
}
if (sem_wait(globalSemaphore)) {
perror("sem_wait error");
}
}
~InstanceLock()
{
if(!global)
{
pthread_mutex_unlock(&processInstanceMutex);
return;
}
if (sem_post(globalSemaphore)) {
perror("sem_post error");
}
}
};
#endif // end of _MSC_VER else
#if defined(__FreeBSD__)
#define cpu_set_t cpuset_t
#endif
class TemporalThreadAffinity // speedup trick for Linux, FreeBSD, DragonFlyBSD, Windows
{
TemporalThreadAffinity(); // forbidden
#if defined(__linux__) || defined(__FreeBSD__) || (defined(__DragonFly__) && __DragonFly_version >= 400707)
cpu_set_t old_affinity;
public:
TemporalThreadAffinity(uint32 core_id)
{
pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &old_affinity);
cpu_set_t new_affinity;
CPU_ZERO(&new_affinity);
CPU_SET(core_id, &new_affinity);
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &new_affinity);
}
~TemporalThreadAffinity()
{
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &old_affinity);
}
bool supported() const { return true; }
#elif defined(_MSC_VER)
ThreadGroupTempAffinity affinity;
public:
TemporalThreadAffinity(uint32 core) : affinity(core) {}
bool supported() const { return true; }
#else // not implemented for os x
public:
TemporalThreadAffinity(uint32) { }
bool supported() const { return false; }
#endif
};
PCM * PCM::instance = NULL;
/*
static int bitCount(uint64 n)
{
int count = 0;
while (n)
{
count += static_cast<int>(n & 0x00000001);
n >>= static_cast<uint64>(1);
}
return count;
}
*/
PCM * PCM::getInstance()
{
// no lock here
if (instance) return instance;
InstanceLock lock(false);
if (instance) return instance;
return instance = new PCM();
}
uint32 build_bit_ui(uint32 beg, uint32 end)
{
uint32 myll = 0;
if (end == 31)
{
myll = (uint32)(-1);
}
else
{
myll = (1 << (end + 1)) - 1;
}
myll = myll >> beg;
return myll;
}
uint32 extract_bits_ui(uint32 myin, uint32 beg, uint32 end)
{
uint32 myll = 0;
uint32 beg1, end1;
// Let the user reverse the order of beg & end.
if (beg <= end)
{
beg1 = beg;
end1 = end;
}
else
{
beg1 = end;
end1 = beg;
}
myll = myin >> beg1;
myll = myll & build_bit_ui(beg1, end1);
return myll;
}
uint64 build_bit(uint32 beg, uint32 end)
{
uint64 myll = 0;
if (end == 63)
{
myll = static_cast<uint64>(-1);
}
else
{
myll = (1LL << (end + 1)) - 1;
}
myll = myll >> beg;
return myll;
}
uint64 extract_bits(uint64 myin, uint32 beg, uint32 end)
{
uint64 myll = 0;
uint32 beg1, end1;
// Let the user reverse the order of beg & end.
if (beg <= end)
{
beg1 = beg;
end1 = end;
}
else
{
beg1 = end;
end1 = beg;
}
myll = myin >> beg1;
myll = myll & build_bit(beg1, end1);
return myll;
}
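// Worked example (illustrative values): extract_bits(0x00440F21, 8, 15)
// first shifts right by 8 (giving 0x00440F) and then masks with
// build_bit(8, 15) == 0xFF, yielding 0x0F. readCoreCounterConfig() below
// decodes the CPUID leaf 0xA bit fields with exactly this pattern.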
uint64 PCM::extractCoreGenCounterValue(uint64 val)
{
if(core_gen_counter_width)
return extract_bits(val, 0, core_gen_counter_width-1);
return val;
}
uint64 PCM::extractCoreFixedCounterValue(uint64 val)
{
if(core_fixed_counter_width)
return extract_bits(val, 0, core_fixed_counter_width-1);
return val;
}
uint64 PCM::extractUncoreGenCounterValue(uint64 val)
{
if(uncore_gen_counter_width)
return extract_bits(val, 0, uncore_gen_counter_width-1);
return val;
}
uint64 PCM::extractUncoreFixedCounterValue(uint64 val)
{
if(uncore_fixed_counter_width)
return extract_bits(val, 0, uncore_fixed_counter_width-1);
return val;
}
uint64 PCM::extractQOSMonitoring(uint64 val)
{
//Check if the Error bit (63) or the Unavailable bit (62) of the IA32_QM_CTR MSR is set
if(val & (3ULL<<62))
{
// invalid reading
return static_cast<uint64>(PCM_INVALID_QOS_MONITORING_DATA);
}
// valid reading
return extract_bits(val,0,61);
}
int32 extractThermalHeadroom(uint64 val)
{
if(val & (1ULL<<31ULL))
{ // valid reading
return static_cast<int32>(extract_bits(val, 16, 22));
}
// invalid reading
return static_cast<int32>(PCM_INVALID_THERMAL_HEADROOM);
}
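// Worked example (illustrative value): for val = (1ULL << 31) | (20ULL << 16)
// the valid-reading bit 31 is set, so extractThermalHeadroom() returns the
// 7-bit field in bits 16..22, i.e. 20; with bit 31 clear it returns
// PCM_INVALID_THERMAL_HEADROOM instead.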
uint64 get_frequency_from_cpuid();
union PCM_CPUID_INFO
{
int array[4];
struct { unsigned int eax, ebx, ecx, edx; } reg;
};
void pcm_cpuid(int leaf, PCM_CPUID_INFO & info)
{
#ifdef _MSC_VER
// version for Windows
__cpuid(info.array, leaf);
#else
__asm__ __volatile__ ("cpuid" : \
"=a" (info.reg.eax), "=b" (info.reg.ebx), "=c" (info.reg.ecx), "=d" (info.reg.edx) : "a" (leaf));
#endif
}
/* Adding the new version of cpuid with leaf and subleaf as an input */
void pcm_cpuid(const unsigned leaf, const unsigned subleaf, PCM_CPUID_INFO & info)
{
#ifdef _MSC_VER
__cpuidex(info.array, leaf, subleaf);
#else
__asm__ __volatile__ ("cpuid" : \
"=a" (info.reg.eax), "=b" (info.reg.ebx), "=c" (info.reg.ecx), "=d" (info.reg.edx) : "a" (leaf), "c" (subleaf));
#endif
}
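// Minimal usage sketch: both overloads fill in all four registers, e.g.
//   PCM_CPUID_INFO info;
//   pcm_cpuid(0, info);      // leaf 0: EAX = max supported leaf,
//                            // EBX:EDX:ECX spell the vendor string
//   pcm_cpuid(0x7, 0, info); // leaf 7, subleaf 0: extended feature flags
// detectModel() below relies on exactly these two calls.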
void PCM::readCoreCounterConfig()
{
if (max_cpuid >= 0xa)
{
// get counter related info
PCM_CPUID_INFO cpuinfo;
pcm_cpuid(0xa, cpuinfo);
perfmon_version = extract_bits_ui(cpuinfo.array[0], 0, 7);
core_gen_counter_num_max = extract_bits_ui(cpuinfo.array[0], 8, 15);
core_gen_counter_width = extract_bits_ui(cpuinfo.array[0], 16, 23);
if (perfmon_version > 1)
{
core_fixed_counter_num_max = extract_bits_ui(cpuinfo.array[3], 0, 4);
core_fixed_counter_width = extract_bits_ui(cpuinfo.array[3], 5, 12);
}
if (isForceRTMAbortModeAvailable() && MSR.size())
{
uint64 TSXForceAbort = 0;
if (MSR[0]->read(MSR_TSX_FORCE_ABORT, &TSXForceAbort) == sizeof(uint64))
{
TSXForceAbort &= 1;
/*
TSXForceAbort is 0 (default mode) => the number of useful gen counters is 3
TSXForceAbort is 1 => the number of gen counters is unchanged
*/
if (TSXForceAbort == 0)
{
core_gen_counter_num_max = 3;
}
}
else
{
std::cerr << "PCM Error: reading MSR_TSX_FORCE_ABORT failed. " << std::endl;
}
}
}
}
void PCM::readCPUMicrocodeLevel()
{
if (MSR.empty()) return;
const int ref_core = 0;
TemporalThreadAffinity affinity(ref_core);
if (affinity.supported() && isCoreOnline(ref_core))
{ // see "Update Signature and Verification" and "Determining the Signature"
// sections in Intel SDM how to read ucode level
if (MSR[ref_core]->write(MSR_IA32_BIOS_SIGN_ID, 0) == sizeof(uint64))
{
PCM_CPUID_INFO cpuinfo;
pcm_cpuid(1, cpuinfo); // the cpuid instruction updates MSR_IA32_BIOS_SIGN_ID
uint64 result = 0;
if (MSR[ref_core]->read(MSR_IA32_BIOS_SIGN_ID, &result) == sizeof(uint64))
{
cpu_microcode_level = result >> 32;
}
}
}
}
int32 PCM::getMaxCustomCoreEvents()
{
return core_gen_counter_num_max;
}
bool PCM::detectModel()
{
char buffer[1024];
union {
char cbuf[16];
int ibuf[16/sizeof(int)];
} buf;
PCM_CPUID_INFO cpuinfo;
pcm_cpuid(0, cpuinfo);
memset(buffer, 0, 1024);
memset(buf.cbuf, 0, 16);
buf.ibuf[0] = cpuinfo.array[1];
buf.ibuf[1] = cpuinfo.array[3];
buf.ibuf[2] = cpuinfo.array[2];
if (strncmp(buf.cbuf, "GenuineIntel", 4 * 3) != 0)
{
std::cerr << getUnsupportedMessage() << std::endl;
return false;
}
max_cpuid = cpuinfo.array[0];
pcm_cpuid(1, cpuinfo);
cpu_family = (((cpuinfo.array[0]) >> 8) & 0xf) | ((cpuinfo.array[0] & 0xf00000) >> 16);
cpu_model = original_cpu_model = (((cpuinfo.array[0]) & 0xf0) >> 4) | ((cpuinfo.array[0] & 0xf0000) >> 12);
cpu_stepping = cpuinfo.array[0] & 0x0f;
if (cpuinfo.reg.ecx & (1UL<<31UL)) {
std::cerr << "Detected a hypervisor/virtualization technology. Some metrics might not be available due to configuration or availability of virtual hardware features." << std::endl;
}
readCoreCounterConfig();
if (cpu_family != 6)
{
std::cerr << getUnsupportedMessage() << " CPU Family: " << cpu_family << std::endl;
return false;
}
pcm_cpuid(7, 0, cpuinfo);
std::cout << "IBRS and IBPB supported : " << ((cpuinfo.reg.edx & (1 << 26)) ? "yes" : "no") << std::endl;
std::cout << "STIBP supported : " << ((cpuinfo.reg.edx & (1 << 27)) ? "yes" : "no") << std::endl;
std::cout << "Spec arch caps supported : " << ((cpuinfo.reg.edx & (1 << 29)) ? "yes" : "no") << std::endl;
return true;
}
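// Worked example (illustrative EAX value): for CPUID.1 EAX = 0x00050654 the
// decode above yields cpu_family = 6 (low family nibble 6, extended family 0),
// cpu_model = 0x5 | (0x5 << 4) = 0x55 = 85 (SKX) from the model nibble plus
// the extended-model field, and cpu_stepping = 4.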
bool PCM::QOSMetricAvailable() const
{
if (isSecureBoot()) return false; // TODO: use perf rdt driver
PCM_CPUID_INFO cpuinfo;
pcm_cpuid(0x7,0,cpuinfo);
return (cpuinfo.reg.ebx & (1<<12))?true:false;
}
bool PCM::L3QOSMetricAvailable() const
{
if (isSecureBoot()) return false; // TODO: use perf rdt driver
PCM_CPUID_INFO cpuinfo;
pcm_cpuid(0xf,0,cpuinfo);
return (cpuinfo.reg.edx & (1<<1))?true:false;
}
bool PCM::L3CacheOccupancyMetricAvailable() const
{
PCM_CPUID_INFO cpuinfo;
if (!(QOSMetricAvailable() && L3QOSMetricAvailable()))
return false;
pcm_cpuid(0xf,0x1,cpuinfo);
return (cpuinfo.reg.edx & 1)?true:false;
}
bool PCM::CoreLocalMemoryBWMetricAvailable() const
{
if (cpu_model == SKX) return false; // SKZ4 errata
PCM_CPUID_INFO cpuinfo;
if (!(QOSMetricAvailable() && L3QOSMetricAvailable()))
return false;
pcm_cpuid(0xf,0x1,cpuinfo);
return (cpuinfo.reg.edx & 2)?true:false;
}
bool PCM::CoreRemoteMemoryBWMetricAvailable() const
{
if (cpu_model == SKX) return false; // SKZ4 errata
PCM_CPUID_INFO cpuinfo;
if (!(QOSMetricAvailable() && L3QOSMetricAvailable()))
return false;
pcm_cpuid(0xf, 0x1, cpuinfo);
return (cpuinfo.reg.edx & 4) ? true : false;
}
unsigned PCM::getMaxRMID() const
{
unsigned maxRMID = 0;
PCM_CPUID_INFO cpuinfo;
pcm_cpuid(0xf,0,cpuinfo);
maxRMID = (unsigned)cpuinfo.reg.ebx + 1;
return maxRMID;
}
void PCM::initRMID()
{
if (!(QOSMetricAvailable() && L3QOSMetricAvailable()))
return;
unsigned maxRMID;
/* Calculate maximum number of RMID supported by socket */
maxRMID = getMaxRMID();
// std::cout << "Maximum RMIDs per socket in the system : " << maxRMID << "\n";
std::vector<uint32> rmid(num_sockets);
for(int32 i = 0; i < num_sockets; i ++)
rmid[i] = maxRMID - 1;
/* Associate each core with 1 RMID */
for(int32 core = 0; core < num_cores; core ++ )
{
if(!isCoreOnline(core)) continue;
uint64 msr_pqr_assoc = 0 ;
uint64 msr_qm_evtsel = 0 ;
MSR[core]->lock();
//Read 0xC8F MSR for each core
MSR[core]->read(IA32_PQR_ASSOC, &msr_pqr_assoc);
//std::cout << "initRMID reading IA32_PQR_ASSOC 0x"<< std::hex << msr_pqr_assoc << std::dec << std::endl;
//std::cout << "Socket Id : " << topology[core].socket;
msr_pqr_assoc &= 0xffffffff00000000ULL;
msr_pqr_assoc |= (uint64)(rmid[topology[core].socket] & ((1ULL<<10)-1ULL));
//std::cout << "initRMID writing IA32_PQR_ASSOC 0x"<< std::hex << msr_pqr_assoc << std::dec << std::endl;
//Write 0xC8F MSR with new RMID for each core
MSR[core]->write(IA32_PQR_ASSOC,msr_pqr_assoc);
msr_qm_evtsel = static_cast<uint64>(rmid[topology[core].socket] & ((1ULL<<10)-1ULL));
msr_qm_evtsel <<= 32 ;
//Write 0xC8D MSR with new RMID for each core
//std::cout << "initRMID writing IA32_QM_EVTSEL 0x"<< std::hex << msr_qm_evtsel << std::dec << std::endl;
MSR[core]->write(IA32_QM_EVTSEL,msr_qm_evtsel);
MSR[core]->unlock();
/* Initializing the memory bandwidth counters */
memory_bw_local.push_back(std::make_shared<CounterWidthExtender>(new CounterWidthExtender::MBLCounter(MSR[core]), 24, 500));
memory_bw_total.push_back(std::make_shared<CounterWidthExtender>(new CounterWidthExtender::MBTCounter(MSR[core]), 24, 500));
rmid[topology[core].socket] --;
}
/* Get The scaling factor by running CPUID.0xF.0x1 instruction */
L3ScalingFactor = getL3ScalingFactor();
}
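// Hedged sampling sketch (illustrative; assumes IA32_QM_CTR at its usual
// 0xC8E address and event id 1 = L3 occupancy):
//   MSR[core]->write(IA32_QM_EVTSEL, (uint64(rmid) << 32) | 1ULL);
//   uint64 ctr = 0;
//   MSR[core]->read(IA32_QM_CTR, &ctr);
//   uint64 bytes = extractQOSMonitoring(ctr) * L3ScalingFactor;
// This is the raw read path that the CounterWidthExtender objects created in
// initRMID() wrap with overflow handling for the bandwidth events.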
void PCM::initQOSevent(const uint64 event, const int32 core)
{
if(!isCoreOnline(core)) return;
uint64 msr_qm_evtsel = 0 ;
//Write 0xC8D MSR with the event id
MSR[core]->read(IA32_QM_EVTSEL, &msr_qm_evtsel);
//std::cout << "initQOSevent reading IA32_QM_EVTSEL 0x"<< std::hex << msr_qm_evtsel << std::dec << std::endl;
msr_qm_evtsel &= 0xfffffffffffffff0ULL;
msr_qm_evtsel |= event & ((1ULL<<8)-1ULL);
//std::cout << "initQOSevent writing IA32_QM_EVTSEL 0x"<< std::hex << msr_qm_evtsel << std::dec << std::endl;
MSR[core]->write(IA32_QM_EVTSEL,msr_qm_evtsel);
}
void PCM::initCStateSupportTables()
{
#define PCM_PARAM_PROTECT(...) __VA_ARGS__
#define PCM_CSTATE_ARRAY(array_ , val ) \
{ \
static uint64 tmp[] = val; \
PCM_COMPILE_ASSERT(sizeof(tmp) / sizeof(uint64) == (static_cast<int>(MAX_C_STATE)+1)); \
array_ = tmp; \
break; \
}
// fill package C state array
switch(original_cpu_model)
{
case ATOM:
case ATOM_2:
case ATOM_CENTERTON:
case ATOM_AVOTON:
case ATOM_BAYTRAIL:
case ATOM_CHERRYTRAIL:
case ATOM_APOLLO_LAKE:
case ATOM_DENVERTON:
PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x3F8, 0, 0x3F9, 0, 0x3FA, 0, 0, 0, 0 }) );
case NEHALEM_EP:
case NEHALEM:
case CLARKDALE:
case WESTMERE_EP:
case NEHALEM_EX:
case WESTMERE_EX:
PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0x3F8, 0, 0, 0x3F9, 0x3FA, 0, 0, 0}) );
case SANDY_BRIDGE:
case JAKETOWN:
case IVY_BRIDGE:
case IVYTOWN:
PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x60D, 0x3F8, 0, 0, 0x3F9, 0x3FA, 0, 0, 0}) );
case HASWELL:
case HASWELL_2:
case HASWELLX:
case BDX_DE:
case BDX:
case KNL:
PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x60D, 0x3F8, 0, 0, 0x3F9, 0x3FA, 0, 0, 0}) );
case SKX:
PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x60D, 0, 0, 0, 0x3F9, 0, 0, 0, 0}) );
case HASWELL_ULT:
case BROADWELL:
case SKL:
case SKL_UY:
case KBL:
case KBL_1:
case BROADWELL_XEON_E3:
PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x60D, 0x3F8, 0, 0, 0x3F9, 0x3FA, 0x630, 0x631, 0x632}) );
default:
std::cerr << "PCM error: package C-states support array is not initialized. Package C-states metrics will not be shown." << std::endl;
PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }) );
};
// fill core C state array
switch(original_cpu_model)
{
case ATOM:
case ATOM_2:
case ATOM_CENTERTON:
PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }) );
case NEHALEM_EP:
case NEHALEM:
case CLARKDALE:
case WESTMERE_EP:
case NEHALEM_EX:
case WESTMERE_EX:
PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0x3FC, 0, 0, 0x3FD, 0, 0, 0, 0}) );
case SANDY_BRIDGE:
case JAKETOWN:
case IVY_BRIDGE:
case IVYTOWN:
case HASWELL:
case HASWELL_2:
case HASWELL_ULT:
case HASWELLX:
case BDX_DE:
case BDX:
case BROADWELL:
case BROADWELL_XEON_E3:
case ATOM_BAYTRAIL:
case ATOM_AVOTON:
case ATOM_CHERRYTRAIL:
case ATOM_APOLLO_LAKE:
case ATOM_DENVERTON:
case SKL_UY:
case SKL:
case KBL:
case KBL_1:
PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0x3FC, 0, 0, 0x3FD, 0x3FE, 0, 0, 0}) );
case KNL:
PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0, 0, 0, 0x3FF, 0, 0, 0, 0}) );
case SKX:
PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0, 0, 0, 0x3FD, 0, 0, 0, 0}) );
default:
std::cerr << "PCM error: core C-states support array is not initialized. Core C-states metrics will not be shown." << std::endl;
PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }) );
};
}
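// Usage sketch (assumption: the residency MSRs advance at a rate comparable
// to the invariant TSC): a package C-state residency sample is a plain read
// of the MSR address stored in the table, e.g. for PC2 on SNB and later
//   uint64 pc2 = 0;
//   if (pkgCStateMsr && pkgCStateMsr[2]) // 0x60D in the arrays above
//       MSR[socketRefCore[s]]->read(pkgCStateMsr[2], &pc2);
// and the residency fraction over an interval is delta(pc2) / delta(TSC).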
#ifdef __linux__
std::string readSysFS(const char * path, bool silent = false)
{
FILE * f = fopen(path, "r");
if (!f)
{
if (silent == false) std::cerr << "ERROR: Can not open "<< path <<" file." << std::endl;
return std::string();
}
char buffer[1024];
if(NULL == fgets(buffer, 1024, f))
{
if (silent == false) std::cerr << "ERROR: Can not read from "<< path << "." << std::endl;
fclose(f);
return std::string();
}
fclose(f);
return std::string(buffer);
}
bool writeSysFS(const char * path, const std::string & value, bool silent = false)
{
FILE * f = fopen(path, "w");
if (!f)
{
if (silent == false) std::cerr << "ERROR: Can not open " << path << " file." << std::endl;
return false;
}
if (fputs(value.c_str(), f) < 0)
{
if (silent == false) std::cerr << "ERROR: Can not write to " << path << "." << std::endl;
fclose(f);
return false;
}
fclose(f);
return true;
}
int readMaxFromSysFS(const char * path)
{
std::string content = readSysFS(path);
const char * buffer = content.c_str();
int result = -1;
pcm_sscanf(buffer) >> s_expect("0-") >> result;
if(result == -1)
{
pcm_sscanf(buffer) >> result;
}
return result;
}
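// Example: /sys/devices/system/cpu/present typically contains "0-63" on a
// 64-CPU system, so the "0-" parse yields 63. On a single-CPU system the file
// contains just "0"; the first parse fails and the fallback parse returns 0.
// discoverSystemTopology() adds 1 to this maximum to get the CPU count.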
#endif
bool PCM::discoverSystemTopology()
{
typedef std::map<uint32, uint32> socketIdMap_type;
socketIdMap_type socketIdMap;
PCM_CPUID_INFO cpuid_args;
// init constants for CPU topology leaf 0xB
// adapted from Topology Enumeration Reference code for Intel 64 Architecture
// https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration
int wasCoreReported = 0, wasThreadReported = 0;
int subleaf = 0, levelType, levelShift;
//uint32 coreSelectMask = 0, smtSelectMask = 0;
uint32 smtMaskWidth = 0;
//uint32 pkgSelectMask = (-1), pkgSelectMaskShift = 0;
uint32 corePlusSMTMaskWidth = 0;
uint32 coreMaskWidth = 0;
{
TemporalThreadAffinity aff0(0);
do
{
pcm_cpuid(0xb, subleaf, cpuid_args);
if (cpuid_args.array[1] == 0)
{ // if EBX ==0 then this subleaf is not valid, we can exit the loop
break;
}
levelType = extract_bits_ui(cpuid_args.array[2], 8, 15);
levelShift = extract_bits_ui(cpuid_args.array[0], 0, 4);
switch (levelType)
{
case 1: //level type is SMT, so levelShift is the SMT_Mask_Width
smtMaskWidth = levelShift;
wasThreadReported = 1;
break;
case 2: //level type is Core, so levelShift is the CorePlusSMT_Mask_Width
corePlusSMTMaskWidth = levelShift;
wasCoreReported = 1;
break;
default:
break;
}
subleaf++;
} while (1);
}
if (wasThreadReported && wasCoreReported)
{
coreMaskWidth = corePlusSMTMaskWidth - smtMaskWidth;
}
else if (!wasCoreReported && wasThreadReported)
{
coreMaskWidth = smtMaskWidth;
}
else
{
std::cerr << "ERROR: Major problem? No leaf 0 under cpuid function 11." << std::endl;
return false;
}
uint32 l2CacheMaskShift = 0;
#ifdef PCM_DEBUG_TOPOLOGY
uint32 threadsSharingL2;
#endif
uint32 l2CacheMaskWidth;
pcm_cpuid(0x4, 2, cpuid_args); // get ID for L2 cache
l2CacheMaskWidth = 1 + extract_bits_ui(cpuid_args.array[0],14,25); // number of APIC IDs sharing L2 cache
#ifdef PCM_DEBUG_TOPOLOGY
threadsSharingL2 = l2CacheMaskWidth;
#endif
for( ; l2CacheMaskWidth > 1; l2CacheMaskWidth >>= 1)
{
l2CacheMaskShift++;
}
#ifdef PCM_DEBUG_TOPOLOGY
std::cerr << "DEBUG: Number of threads sharing L2 cache = " << threadsSharingL2
<< " [the most significant bit = " << l2CacheMaskShift << "]" << std::endl;
#endif
auto populateEntry = [&smtMaskWidth, &coreMaskWidth, &l2CacheMaskShift](TopologyEntry & entry, const int apic_id)
{
entry.thread_id = extract_bits_ui(apic_id, 0, smtMaskWidth - 1);
entry.core_id = extract_bits_ui(apic_id, smtMaskWidth, smtMaskWidth + coreMaskWidth - 1);
entry.socket = extract_bits_ui(apic_id, smtMaskWidth + coreMaskWidth, 31);
entry.tile_id = extract_bits_ui(apic_id, l2CacheMaskShift, 31);
};
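// Worked example (illustrative widths): with smtMaskWidth = 1 and
// coreMaskWidth = 4, an apic_id of 35 (0b100011) decomposes into
// thread_id = bit 0 = 1, core_id = bits 1..4 = 1 and socket = bits 5..31 = 1;
// tile_id additionally groups the logical processors sharing an L2 cache.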
#ifdef _MSC_VER
// version for Windows 7 and later versions
char * slpi = new char[sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)];
DWORD len = (DWORD)sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX);
BOOL res = GetLogicalProcessorInformationEx(RelationAll, (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)slpi, &len);
while (res == FALSE)
{
delete[] slpi;
if (GetLastError() == ERROR_INSUFFICIENT_BUFFER)
{
slpi = new char[len];
res = GetLogicalProcessorInformationEx(RelationAll, (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)slpi, &len);
}
else
{
std::wcerr << "Error in Windows function 'GetLogicalProcessorInformationEx': " <<
GetLastError() << " ";
const TCHAR * strError = _com_error(GetLastError()).ErrorMessage();
if (strError) std::wcerr << strError;
std::wcerr << std::endl;
return false;
}
}
char * base_slpi = slpi;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pi = NULL;
for ( ; slpi < base_slpi + len; slpi += (DWORD)pi->Size)
{
pi = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)slpi;
if (pi->Relationship == RelationProcessorCore)
{
threads_per_core = (pi->Processor.Flags == LTP_PC_SMT) ? 2 : 1;
// std::cout << "thr per core: "<< threads_per_core << std::endl;
num_cores += threads_per_core;
}
}
if (num_cores != GetActiveProcessorCount(ALL_PROCESSOR_GROUPS))
{
std::cerr << "Error in processor group size counting: " << num_cores << "!=" << GetActiveProcessorCount(ALL_PROCESSOR_GROUPS) << std::endl;
std::cerr << "Make sure your binary is compiled for 64-bit: using 'x64' platform configuration." << std::endl;
return false;
}
for (int i = 0; i < (int)num_cores; i++)
{
ThreadGroupTempAffinity affinity(i);
pcm_cpuid(0xb, 0x0, cpuid_args);
int apic_id = cpuid_args.array[3];
TopologyEntry entry;
entry.os_id = i;
populateEntry(entry, apic_id);
topology.push_back(entry);
socketIdMap[entry.socket] = 0;
}
delete[] base_slpi;
#else
// for Linux, Mac OS, FreeBSD and DragonFlyBSD
TopologyEntry entry;
#ifdef __linux__
num_cores = readMaxFromSysFS("/sys/devices/system/cpu/present");
if(num_cores == -1)
{
std::cerr << "Cannot read number of present cores" << std::endl;
return false;
}
++num_cores;
// open /proc/cpuinfo
FILE * f_cpuinfo = fopen("/proc/cpuinfo", "r");
if (!f_cpuinfo)
{
std::cerr << "Cannot open /proc/cpuinfo file." << std::endl;
return false;
}
// map with key=pkg_apic_id (not necessarily zero based or sequential) and
// associated value=socket_id that should be 0 based and sequential
std::map<int, int> found_pkg_ids;
topology.resize(num_cores);
char buffer[1024];
while (0 != fgets(buffer, 1024, f_cpuinfo))
{
if (strncmp(buffer, "processor", sizeof("processor") - 1) == 0)
{
pcm_sscanf(buffer) >> s_expect("processor\t: ") >> entry.os_id;
//std::cout << "os_core_id: "<<entry.os_id<< std::endl;
TemporalThreadAffinity _(entry.os_id);
pcm_cpuid(0xb, 0x0, cpuid_args);
int apic_id = cpuid_args.array[3];
populateEntry(entry, apic_id);
topology[entry.os_id] = entry;
socketIdMap[entry.socket] = 0;
++num_online_cores;
}
}
fclose(f_cpuinfo);
// produce debug output similar to Intel MPI cpuinfo
#ifdef PCM_DEBUG_TOPOLOGY
std::cerr << "===== Processor identification =====" << std::endl;
std::cerr << "Processor Thread Id. Core Id. Tile Id. Package Id." << std::endl;
std::map<uint32, std::vector<uint32> > os_id_by_core, os_id_by_tile, core_id_by_socket;
for(auto it = topology.begin(); it != topology.end(); ++it)
{
std::cerr << std::left << std::setfill(' ')
<< std::setw(16) << it->os_id
<< std::setw(16) << it->thread_id
<< std::setw(16) << it->core_id
<< std::setw(16) << it->tile_id
<< std::setw(16) << it->socket
<< std::endl << std::flush;
if(std::find(core_id_by_socket[it->socket].begin(), core_id_by_socket[it->socket].end(), it->core_id)
== core_id_by_socket[it->socket].end())
core_id_by_socket[it->socket].push_back(it->core_id);
// add socket offset to distinguish cores and tiles from different sockets
os_id_by_core[(it->socket << 15) + it->core_id].push_back(it->os_id);
os_id_by_tile[(it->socket << 15) + it->tile_id].push_back(it->os_id);
}
std::cerr << "===== Placement on packages =====" << std::endl;
std::cerr << "Package Id. Core Id. Processors" << std::endl;
for(auto pkg = core_id_by_socket.begin(); pkg != core_id_by_socket.end(); ++pkg)
{
auto core_id = pkg->second.begin();
std::cerr << std::left << std::setfill(' ') << std::setw(15) << pkg->first << *core_id;
for(++core_id; core_id != pkg->second.end(); ++core_id)
{
std::cerr << "," << *core_id;
}
std::cerr << std::endl;
}
std::cerr << std::endl << "===== Core/Tile sharing =====" << std::endl;
std::cerr << "Level Processors" << std::endl << "Core ";
for(auto core = os_id_by_core.begin(); core != os_id_by_core.end(); ++core)
{
auto os_id = core->second.begin();
std::cerr << "(" << *os_id;
for(++os_id; os_id != core->second.end(); ++os_id) {
std::cerr << "," << *os_id;
}
std::cerr << ")";
}
std::cerr << std::endl << "Tile / L2$ ";
for(auto core = os_id_by_tile.begin(); core != os_id_by_tile.end(); ++core)
{
auto os_id = core->second.begin();
std::cerr << "(" << *os_id;
for(++os_id; os_id != core->second.end(); ++os_id) {
std::cerr << "," << *os_id;
}
std::cerr << ")";
}
std::cerr << std::endl;
#endif // PCM_DEBUG_TOPOLOGY
#elif defined(__FreeBSD__) || defined(__DragonFly__)
size_t size = sizeof(num_cores);
cpuctl_cpuid_args_t cpuid_args_freebsd;
int fd;
if(0 != sysctlbyname("hw.ncpu", &num_cores, &size, NULL, 0))
{
std::cerr << "Unable to get hw.ncpu from sysctl." << std::endl;
return false;
}
if (modfind("cpuctl") == -1)
{
std::cout << "cpuctl(4) not loaded." << std::endl;
return false;
}
for (int i = 0; i < num_cores; i++)
{
char cpuctl_name[64];
int apic_id;
snprintf(cpuctl_name, 64, "/dev/cpuctl%d", i);
fd = ::open(cpuctl_name, O_RDWR);
cpuid_args_freebsd.level = 0xb;
::ioctl(fd, CPUCTL_CPUID, &cpuid_args_freebsd);
apic_id = cpuid_args_freebsd.data[3];
entry.os_id = i;
populateEntry(entry, apic_id);
if (entry.socket == 0 && entry.core_id == 0) ++threads_per_core;
topology.push_back(entry);
socketIdMap[entry.socket] = 0;
}
#else // Getting processor info for Mac OS
#define SAFE_SYSCTLBYNAME(message, ret_value) \
{ \
size_t size; \
char *pParam; \
if(0 != sysctlbyname(message, NULL, &size, NULL, 0)) \
{ \
std::cerr << "Unable to determine size of " << message << " sysctl return type." << std::endl; \
return false; \
} \
if(NULL == (pParam = (char *)malloc(size))) \
{ \
std::cerr << "Unable to allocate memory for " << message << std::endl; \
return false; \
} \
if(0 != sysctlbyname(message, (void*)pParam, &size, NULL, 0)) \
{ \
std::cerr << "Unable to get " << message << " from sysctl." << std::endl; \
return false; \
} \
ret_value = convertUnknownToInt(size, pParam); \
free(pParam); \
}
// End SAFE_SYSCTLBYNAME
// Using OS X's sysctl to get the number of CPUs right away
SAFE_SYSCTLBYNAME("hw.logicalcpu", num_cores)
#undef SAFE_SYSCTLBYNAME
// The OSX version needs the MSR handle earlier so that it can build the CPU topology.
// This topology functionality should potentially go into a different KEXT
for(int i = 0; i < num_cores; i++)
{
MSR.push_back(std::make_shared<SafeMsrHandle>(i));
}
TopologyEntry *entries = new TopologyEntry[num_cores];
MSR[0]->buildTopology(num_cores, entries);
for(int i = 0; i < num_cores; i++){
socketIdMap[entries[i].socket] = 0;
if(entries[i].os_id >= 0)
{
if(entries[i].core_id == 0 && entries[i].socket == 0) ++threads_per_core;
topology.push_back(entries[i]);
}
}
delete[] entries;
// End of OSX specific code
#endif // end of ifndef __APPLE__
#endif //end of ifdef _MSC_VER
if(num_cores == 0) {
num_cores = (int32)topology.size();
}
if(num_sockets == 0) {
num_sockets = (int32)(std::max)(socketIdMap.size(), (size_t)1);
}
socketIdMap_type::iterator s = socketIdMap.begin();
for (uint32 sid = 0; s != socketIdMap.end(); ++s)
{
s->second = sid++;
}
for (int i = 0; (i < (int)num_cores) && (!socketIdMap.empty()); ++i)
{
if(isCoreOnline((int32)i))
topology[i].socket = socketIdMap[topology[i].socket];
}
#if 0
std::cerr << "Number of socket ids: " << socketIdMap.size() << "\n";
std::cerr << "Topology:\nsocket os_id core_id\n";
for (int i = 0; i < num_cores; ++i)
{
std::cerr << topology[i].socket << " " << topology[i].os_id << " " << topology[i].core_id << std::endl;
}
#endif
if(threads_per_core == 0)
{
for (int i = 0; i < (int)num_cores; ++i)
{
if(topology[i].socket == topology[0].socket && topology[i].core_id == topology[0].core_id)
++threads_per_core;
}
}
if(num_phys_cores_per_socket == 0) num_phys_cores_per_socket = num_cores / num_sockets / threads_per_core;
if(num_online_cores == 0) num_online_cores = num_cores;
int32 i = 0;
socketRefCore.resize(num_sockets, -1);
for(i = 0; i < num_cores; ++i)
{
if(isCoreOnline(i))
{
socketRefCore[topology[i].socket] = i;
}
}
num_online_sockets = 0;
for(i = 0; i < num_sockets; ++i)
{
if(isSocketOnline(i))
{
++num_online_sockets;
}
}
#if 0
for(int32 i=0; i< num_sockets;++i)
{
std::cout << "socketRefCore["<< i << "]=" << socketRefCore[i] << std::endl;
}
#endif
return true;
}
void PCM::printSystemTopology() const
{
if(num_cores == num_online_cores)
{
std::cerr << "Number of physical cores: " << (num_cores/threads_per_core) << std::endl;
}
std::cerr << "Number of logical cores: " << num_cores << std::endl;
std::cerr << "Number of online logical cores: " << num_online_cores << std::endl;
if(num_cores == num_online_cores)
{
std::cerr << "Threads (logical cores) per physical core: " << threads_per_core << std::endl;
}
else
{
std::cerr << "Offlined cores: ";
for (int i = 0; i < (int)num_cores; ++i)
if(isCoreOnline((int32)i) == false)
std::cerr << i << " ";
std::cerr << std::endl;
}
std::cerr << "Num sockets: " << num_sockets << std::endl;
std::cerr << "Physical cores per socket: " << num_phys_cores_per_socket << std::endl;
std::cerr << "Core PMU (perfmon) version: " << perfmon_version << std::endl;
std::cerr << "Number of core PMU generic (programmable) counters: " << core_gen_counter_num_max << std::endl;
std::cerr << "Width of generic (programmable) counters: " << core_gen_counter_width << " bits" << std::endl;
if (perfmon_version > 1)
{
std::cerr << "Number of core PMU fixed counters: " << core_fixed_counter_num_max << std::endl;
std::cerr << "Width of fixed counters: " << core_fixed_counter_width << " bits" << std::endl;
}
}
bool PCM::initMSR()
{
#ifndef __APPLE__
try
{
for (int i = 0; i < (int)num_cores; ++i)
{
if (isCoreOnline((int32)i))
MSR.push_back(std::make_shared<SafeMsrHandle>(i));
else // the core is offlined, assign an invalid MSR handle
MSR.push_back(std::make_shared<SafeMsrHandle>());
}
}
catch (...)
{
// failed
MSR.clear();
std::cerr << "Can not access CPUs Model Specific Registers (MSRs)." << std::endl;
#ifdef _MSC_VER
std::cerr << "You must have signed msr.sys driver in your current directory and have administrator rights to run this program." << std::endl;
#elif defined(__linux__)
std::cerr << "Try to execute 'modprobe msr' as root user and then" << std::endl;
std::cerr << "you also must have read and write permissions for /dev/cpu/*/msr devices (/dev/msr* for Android). The 'chown' command can help." << std::endl;
#elif defined(__FreeBSD__) || defined(__DragonFly__)
std::cerr << "Ensure cpuctl module is loaded and that you have read and write" << std::endl;
std::cerr << "permissions for /dev/cpuctl* devices (the 'chown' command can help)." << std::endl;
#endif
return false;
}
#endif
return true;
}
bool PCM::detectNominalFrequency()
{
if (MSR.size())
{
uint64 freq = 0;
MSR[socketRefCore[0]]->read(PLATFORM_INFO_ADDR, &freq);
const uint64 bus_freq = (
cpu_model == SANDY_BRIDGE
|| cpu_model == JAKETOWN
|| cpu_model == IVYTOWN
|| cpu_model == HASWELLX
|| cpu_model == BDX_DE
|| cpu_model == BDX
|| cpu_model == IVY_BRIDGE
|| cpu_model == HASWELL
|| cpu_model == BROADWELL
|| original_cpu_model == ATOM_AVOTON
|| original_cpu_model == ATOM_APOLLO_LAKE
|| original_cpu_model == ATOM_DENVERTON
|| cpu_model == SKL
|| cpu_model == KBL
|| cpu_model == KNL
|| cpu_model == SKX
) ? (100000000ULL) : (133333333ULL);
nominal_frequency = ((freq >> 8) & 255) * bus_freq;
if(!nominal_frequency)
nominal_frequency = get_frequency_from_cpuid();
if(!nominal_frequency)
{
std::cerr << "Error: Can not detect core frequency." << std::endl;
destroyMSR();
return false;
}
#ifndef PCM_SILENT
std::cerr << "Nominal core frequency: " << nominal_frequency << " Hz" << std::endl;
#endif
}
return true;
}
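// Worked example (illustrative MSR value): if bits 15:8 of PLATFORM_INFO hold
// the ratio 28 (0x1C) on a 100 MHz bus-clock model, the computation above
// gives nominal_frequency = 28 * 100000000 = 2800000000 Hz (2.8 GHz).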
void PCM::initEnergyMonitoring()
{
if(packageEnergyMetricsAvailable() && MSR.size())
{
uint64 rapl_power_unit = 0;
MSR[socketRefCore[0]]->read(MSR_RAPL_POWER_UNIT,&rapl_power_unit);
uint64 energy_status_unit = extract_bits(rapl_power_unit,8,12);
if (original_cpu_model == PCM::ATOM_CHERRYTRAIL || original_cpu_model == PCM::ATOM_BAYTRAIL)
joulesPerEnergyUnit = double(1ULL << energy_status_unit)/1000000.; // (2)^energy_status_unit microJoules
else
joulesPerEnergyUnit = 1./double(1ULL<<energy_status_unit); // (1/2)^energy_status_unit
//std::cout << "MSR_RAPL_POWER_UNIT: "<<energy_status_unit<<"; Joules/unit "<< joulesPerEnergyUnit << std::endl;
uint64 power_unit = extract_bits(rapl_power_unit,0,3);
double wattsPerPowerUnit = 1./double(1ULL<<power_unit);
uint64 package_power_info = 0;
MSR[socketRefCore[0]]->read(MSR_PKG_POWER_INFO,&package_power_info);
pkgThermalSpecPower = (int32) (double(extract_bits(package_power_info, 0, 14))*wattsPerPowerUnit);
pkgMinimumPower = (int32) (double(extract_bits(package_power_info, 16, 30))*wattsPerPowerUnit);
pkgMaximumPower = (int32) (double(extract_bits(package_power_info, 32, 46))*wattsPerPowerUnit);
#ifndef PCM_SILENT
std::cerr << "Package thermal spec power: "<< pkgThermalSpecPower << " Watt; ";
std::cerr << "Package minimum power: "<< pkgMinimumPower << " Watt; ";
std::cerr << "Package maximum power: "<< pkgMaximumPower << " Watt; " << std::endl;
#endif
int i = 0;
if(energy_status.empty())
for (i = 0; i < (int)num_sockets; ++i)
energy_status.push_back(
std::make_shared<CounterWidthExtender>(
new CounterWidthExtender::MsrHandleCounter(MSR[socketRefCore[i]], MSR_PKG_ENERGY_STATUS), 32, 10000));
if(dramEnergyMetricsAvailable() && dram_energy_status.empty())
for (i = 0; i < (int)num_sockets; ++i)
dram_energy_status.push_back(
std::make_shared<CounterWidthExtender>(
new CounterWidthExtender::MsrHandleCounter(MSR[socketRefCore[i]], MSR_DRAM_ENERGY_STATUS), 32, 10000));
}
}
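// Worked example (common default): if MSR_RAPL_POWER_UNIT reports
// energy_status_unit = 16, then joulesPerEnergyUnit = 1/2^16 ~= 15.3
// microjoules per count, and energy in Joules is the (width-extended)
// MSR_PKG_ENERGY_STATUS delta multiplied by this factor. On
// Cherrytrail/Baytrail Atoms the unit is 2^energy_status_unit microjoules,
// hence the separate formula above.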
void PCM::initUncoreObjects()
{
if (hasPCICFGUncore() && MSR.size())
{
int i = 0;
try
{
for (i = 0; i < (int)num_sockets; ++i)
{
server_pcicfg_uncore.push_back(std::make_shared<ServerPCICFGUncore>(i, this));
}
}
catch (...)
{
server_pcicfg_uncore.clear();
std::cerr << "Can not access Jaketown/Ivytown PCI configuration space. Access to uncore counters (memory and QPI bandwidth) is disabled." << std::endl;
#ifdef _MSC_VER
std::cerr << "You must have signed msr.sys driver in your current directory and have administrator rights to run this program." << std::endl;
#else
//std::cerr << "you must have read and write permissions for /proc/bus/pci/7f/10.* and /proc/bus/pci/ff/10.* devices (the 'chown' command can help)." << std::endl;
//std::cerr << "you must have read and write permissions for /dev/mem device (the 'chown' command can help)."<< std::endl;
//std::cerr << "you must have read permission for /sys/firmware/acpi/tables/MCFG device (the 'chmod' command can help)."<< std::endl;
std::cerr << "You must be root to access these Jaketown/Ivytown counters in PCM. " << std::endl;
#endif
}
} else if((cpu_model == SANDY_BRIDGE || cpu_model == IVY_BRIDGE || cpu_model == HASWELL || cpu_model == BROADWELL || cpu_model == SKL || cpu_model == KBL) && MSR.size())
{
// initialize memory bandwidth counting
try
{
clientBW = std::make_shared<ClientBW>();
clientImcReads = std::make_shared<CounterWidthExtender>(
new CounterWidthExtender::ClientImcReadsCounter(clientBW), 32, 10000);
clientImcWrites = std::make_shared<CounterWidthExtender>(
new CounterWidthExtender::ClientImcWritesCounter(clientBW), 32, 10000);
clientIoRequests = std::make_shared<CounterWidthExtender>(
new CounterWidthExtender::ClientIoRequestsCounter(clientBW), 32, 10000);
} catch(...)
{
std::cerr << "Can not read memory controller counter information from PCI configuration space. Access to memory bandwidth counters is not possible." << std::endl;
#ifdef _MSC_VER
// TODO: add message here
#endif
#ifdef __linux__
std::cerr << "You must be root to access these SandyBridge/IvyBridge/Haswell counters in PCM. " << std::endl;
#endif
}
}
if (useLinuxPerfForUncore())
{
initUncorePMUsPerf();
}
else
{
initUncorePMUsDirect();
}
}
void PCM::initUncorePMUsDirect()
{
for (uint32 s = 0; s < (uint32)num_sockets; ++s)
{
auto & handle = MSR[socketRefCore[s]];
uboxPMUs.push_back(
UncorePMU(
std::shared_ptr<MSRRegister>(),
std::make_shared<MSRRegister>(handle, UBOX_MSR_PMON_CTL0_ADDR),
std::make_shared<MSRRegister>(handle, UBOX_MSR_PMON_CTL1_ADDR),
std::shared_ptr<MSRRegister>(),
std::shared_ptr<MSRRegister>(),
std::make_shared<MSRRegister>(handle, UBOX_MSR_PMON_CTR0_ADDR),
std::make_shared<MSRRegister>(handle, UBOX_MSR_PMON_CTR1_ADDR),
std::shared_ptr<MSRRegister>(),
std::shared_ptr<MSRRegister>(),
std::make_shared<MSRRegister>(handle, UCLK_FIXED_CTL_ADDR),
std::make_shared<MSRRegister>(handle, UCLK_FIXED_CTR_ADDR)
)
);
switch (cpu_model)
{
case IVYTOWN:
case JAKETOWN:
pcuPMUs.push_back(
UncorePMU(
std::make_shared<MSRRegister>(handle, JKTIVT_PCU_MSR_PMON_BOX_CTL_ADDR),
std::make_shared<MSRRegister>(handle, JKTIVT_PCU_MSR_PMON_CTL0_ADDR),
std::make_shared<MSRRegister>(handle, JKTIVT_PCU_MSR_PMON_CTL1_ADDR),
std::make_shared<MSRRegister>(handle, JKTIVT_PCU_MSR_PMON_CTL2_ADDR),
std::make_shared<MSRRegister>(handle, JKTIVT_PCU_MSR_PMON_CTL3_ADDR),
std::make_shared<MSRRegister>(handle, JKTIVT_PCU_MSR_PMON_CTR0_ADDR),
std::make_shared<MSRRegister>(handle, JKTIVT_PCU_MSR_PMON_CTR1_ADDR),
std::make_shared<MSRRegister>(handle, JKTIVT_PCU_MSR_PMON_CTR2_ADDR),
std::make_shared<MSRRegister>(handle, JKTIVT_PCU_MSR_PMON_CTR3_ADDR),
std::shared_ptr<MSRRegister>(),
std::shared_ptr<MSRRegister>(),
std::make_shared<MSRRegister>(handle, JKTIVT_PCU_MSR_PMON_BOX_FILTER_ADDR)
)
);
break;
case BDX_DE:
case BDX:
case KNL:
case HASWELLX:
case SKX:
pcuPMUs.push_back(
UncorePMU(
std::make_shared<MSRRegister>(handle, HSX_PCU_MSR_PMON_BOX_CTL_ADDR),
std::make_shared<MSRRegister>(handle, HSX_PCU_MSR_PMON_CTL0_ADDR),
std::make_shared<MSRRegister>(handle, HSX_PCU_MSR_PMON_CTL1_ADDR),
std::make_shared<MSRRegister>(handle, HSX_PCU_MSR_PMON_CTL2_ADDR),
std::make_shared<MSRRegister>(handle, HSX_PCU_MSR_PMON_CTL3_ADDR),
std::make_shared<MSRRegister>(handle, HSX_PCU_MSR_PMON_CTR0_ADDR),
std::make_shared<MSRRegister>(handle, HSX_PCU_MSR_PMON_CTR1_ADDR),
std::make_shared<MSRRegister>(handle, HSX_PCU_MSR_PMON_CTR2_ADDR),
std::make_shared<MSRRegister>(handle, HSX_PCU_MSR_PMON_CTR3_ADDR),
std::shared_ptr<MSRRegister>(),
std::shared_ptr<MSRRegister>(),
std::make_shared<MSRRegister>(handle, HSX_PCU_MSR_PMON_BOX_FILTER_ADDR)
)
);
break;
}
}
// init IIO addresses
std::vector<int32> IIO_units;
IIO_units.push_back((int32)IIO_CBDMA);
IIO_units.push_back((int32)IIO_PCIe0);
IIO_units.push_back((int32)IIO_PCIe1);
IIO_units.push_back((int32)IIO_PCIe2);
IIO_units.push_back((int32)IIO_MCP0);
IIO_units.push_back((int32)IIO_MCP1);
if (IIOEventsAvailable())
{
iioPMUs.resize(num_sockets);
for (uint32 s = 0; s < (uint32)num_sockets; ++s)
{
auto & handle = MSR[socketRefCore[s]];
for (const auto & unit: IIO_units)
{
iioPMUs[s][unit] = UncorePMU(
std::make_shared<MSRRegister>(handle, SKX_IIO_CBDMA_UNIT_CTL + SKX_IIO_PM_REG_STEP * unit),
std::make_shared<MSRRegister>(handle, SKX_IIO_CBDMA_CTL0 + SKX_IIO_PM_REG_STEP * unit + 0),
std::make_shared<MSRRegister>(handle, SKX_IIO_CBDMA_CTL0 + SKX_IIO_PM_REG_STEP * unit + 1),
std::make_shared<MSRRegister>(handle, SKX_IIO_CBDMA_CTL0 + SKX_IIO_PM_REG_STEP * unit + 2),
std::make_shared<MSRRegister>(handle, SKX_IIO_CBDMA_CTL0 + SKX_IIO_PM_REG_STEP * unit + 3),
std::make_shared<MSRRegister>(handle, SKX_IIO_CBDMA_CTR0 + SKX_IIO_PM_REG_STEP * unit + 0),
std::make_shared<MSRRegister>(handle, SKX_IIO_CBDMA_CTR0 + SKX_IIO_PM_REG_STEP * unit + 1),
std::make_shared<MSRRegister>(handle, SKX_IIO_CBDMA_CTR0 + SKX_IIO_PM_REG_STEP * unit + 2),
std::make_shared<MSRRegister>(handle, SKX_IIO_CBDMA_CTR0 + SKX_IIO_PM_REG_STEP * unit + 3)
);
}
}
}
if (hasPCICFGUncore() && MSR.size())
{
cboPMUs.resize(num_sockets);
for (uint32 s = 0; s < (uint32)num_sockets; ++s)
{
auto & handle = MSR[socketRefCore[s]];
for (uint32 cbo = 0; cbo < getMaxNumOfCBoxes(); ++cbo)
{
cboPMUs[s].push_back(
UncorePMU(
std::make_shared<MSRRegister>(handle, CX_MSR_PMON_BOX_CTL(cbo)),
std::make_shared<MSRRegister>(handle, CX_MSR_PMON_CTLY(cbo, 0)),
std::make_shared<MSRRegister>(handle, CX_MSR_PMON_CTLY(cbo, 1)),
std::make_shared<MSRRegister>(handle, CX_MSR_PMON_CTLY(cbo, 2)),
std::make_shared<MSRRegister>(handle, CX_MSR_PMON_CTLY(cbo, 3)),
std::make_shared<CounterWidthExtenderRegister>(
std::make_shared<CounterWidthExtender>(new CounterWidthExtender::MsrHandleCounter(MSR[socketRefCore[s]], CX_MSR_PMON_CTRY(cbo, 0)), 48, 5555)),
std::make_shared<CounterWidthExtenderRegister>(
std::make_shared<CounterWidthExtender>(new CounterWidthExtender::MsrHandleCounter(MSR[socketRefCore[s]], CX_MSR_PMON_CTRY(cbo, 1)), 48, 5555)),
std::make_shared<CounterWidthExtenderRegister>(
std::make_shared<CounterWidthExtender>(new CounterWidthExtender::MsrHandleCounter(MSR[socketRefCore[s]], CX_MSR_PMON_CTRY(cbo, 2)), 48, 5555)),
std::make_shared<CounterWidthExtenderRegister>(
std::make_shared<CounterWidthExtender>(new CounterWidthExtender::MsrHandleCounter(MSR[socketRefCore[s]], CX_MSR_PMON_CTRY(cbo, 3)), 48, 5555)),
std::shared_ptr<MSRRegister>(),
std::shared_ptr<MSRRegister>(),
std::make_shared<MSRRegister>(handle, CX_MSR_PMON_BOX_FILTER(cbo)),
std::make_shared<MSRRegister>(handle, CX_MSR_PMON_BOX_FILTER1(cbo))
)
);
}
}
}
}
#ifdef PCM_USE_PERF
std::vector<int> enumeratePerfPMUs(const std::string & type, int max_id);
void populatePerfPMUs(unsigned socket_, const std::vector<int> & ids, std::vector<UncorePMU> & pmus, bool fixed, bool filter0 = false, bool filter1 = false);
#endif
void PCM::initUncorePMUsPerf()
{
#ifdef PCM_USE_PERF
iioPMUs.resize(num_sockets);
cboPMUs.resize(num_sockets);
for (uint32 s = 0; s < (uint32)num_sockets; ++s)
{
populatePerfPMUs(s, enumeratePerfPMUs("pcu", 100), pcuPMUs, false, true);
populatePerfPMUs(s, enumeratePerfPMUs("ubox", 100), uboxPMUs, true);
populatePerfPMUs(s, enumeratePerfPMUs("cbox", 100), cboPMUs[s], false, true, true);
populatePerfPMUs(s, enumeratePerfPMUs("cha", 200), cboPMUs[s], false, true, true);
std::vector<UncorePMU> iioPMUVector;
populatePerfPMUs(s, enumeratePerfPMUs("iio", 100), iioPMUVector, false);
for (size_t i = 0; i < iioPMUVector.size(); ++i)
{
iioPMUs[s][i] = iioPMUVector[i];
}
}
#endif
}
#ifdef __linux__
#define PCM_NMI_WATCHDOG_PATH "/proc/sys/kernel/nmi_watchdog"
bool isNMIWatchdogEnabled()
{
const auto watchdog = readSysFS(PCM_NMI_WATCHDOG_PATH);
if (watchdog.length() == 0)
{
return false;
}
return (std::atoi(watchdog.c_str()) == 1);
}
void disableNMIWatchdog()
{
std::cout << "Disabling NMI watchdog since it consumes one hw-PMU counter." << std::endl;
writeSysFS(PCM_NMI_WATCHDOG_PATH, "0");
}
void enableNMIWatchdog()
{
std::cout << " Re-enabling NMI watchdog." << std::endl;
writeSysFS(PCM_NMI_WATCHDOG_PATH, "1");
}
#endif
class CoreTaskQueue
{
std::queue<std::packaged_task<void()> > wQueue;
std::mutex m;
std::condition_variable condVar;
std::thread worker;
CoreTaskQueue() = delete;
CoreTaskQueue(CoreTaskQueue &) = delete;
public:
CoreTaskQueue(int32 core) :
worker([this, core]() { // capture 'core' by value: the thread may start after the constructor's parameter is gone
TemporalThreadAffinity tempThreadAffinity(core);
std::unique_lock<std::mutex> lock(m);
while (1) {
while (wQueue.empty()) {
condVar.wait(lock);
}
while (!wQueue.empty()) {
wQueue.front()();
wQueue.pop();
}
}
})
{}
void push(std::packaged_task<void()> & task)
{
std::unique_lock<std::mutex> lock(m);
wQueue.push(std::move(task));
condVar.notify_one();
}
};
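// Usage sketch: callers wrap per-core work in a std::packaged_task and wait
// on its future, e.g. (perCoreWork is a hypothetical callable):
//   std::packaged_task<void()> job(perCoreWork);
//   auto done = job.get_future();
//   coreTaskQueues[core]->push(job);
//   done.get(); // the task ran on the worker thread pinned to 'core'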
PCM::PCM() :
cpu_family(-1),
cpu_model(-1),
original_cpu_model(-1),
cpu_stepping(-1),
cpu_microcode_level(-1),
max_cpuid(-1),
threads_per_core(0),
num_cores(0),
num_sockets(0),
num_phys_cores_per_socket(0),
num_online_cores(0),
num_online_sockets(0),
core_gen_counter_num_max(0),
core_gen_counter_num_used(0), // 0 means no core gen counters used
core_gen_counter_width(0),
core_fixed_counter_num_max(0),
core_fixed_counter_num_used(0),
core_fixed_counter_width(0),
uncore_gen_counter_num_max(8),
uncore_gen_counter_num_used(0),
uncore_gen_counter_width(48),
uncore_fixed_counter_num_max(1),
uncore_fixed_counter_num_used(0),
uncore_fixed_counter_width(48),
perfmon_version(0),
perfmon_config_anythread(1),
nominal_frequency(0),
max_qpi_speed(0),
L3ScalingFactor(0),
pkgThermalSpecPower(-1),
pkgMinimumPower(-1),
pkgMaximumPower(-1),
allow_multiple_instances(false),
programmed_pmu(false),
joulesPerEnergyUnit(0),
disable_JKT_workaround(false),
blocked(false),
coreCStateMsr(NULL),
pkgCStateMsr(NULL),
L2CacheHitRatioAvailable(false),
L3CacheHitRatioAvailable(false),
L3CacheMissesAvailable(false),
L2CacheMissesAvailable(false),
L2CacheHitsAvailable(false),
L3CacheHitsNoSnoopAvailable(false),
L3CacheHitsSnoopAvailable(false),
L3CacheHitsAvailable(false),
CyclesLostDueL3CacheMissesAvailable(false),
CyclesLostDueL2CacheMissesAvailable(false),
forceRTMAbortMode(false),
mode(INVALID_MODE),
numInstancesSemaphore(NULL),
canUsePerf(false),
outfile(NULL),
backup_ofile(NULL),
run_state(1),
needToRestoreNMIWatchdog(false)
{
#ifdef _MSC_VER
TCHAR driverPath[1040]; // length for current directory + "\\msr.sys"
GetCurrentDirectory(1024, driverPath);
wcscat_s(driverPath, 1040, L"\\msr.sys");
// WARNING: This driver code (msr.sys) is only for testing purposes, not for production use
Driver drv;
// drv.stop(); // restart driver (usually not needed)
if (!drv.start(driverPath))
{
std::cerr << "Cannot access CPU counters" << std::endl;
std::cerr << "You must have signed msr.sys driver in your current directory and have administrator rights to run this program" << std::endl;
return;
}
#endif
if(!detectModel()) return;
if(!checkModel()) return;
initCStateSupportTables();
if(!discoverSystemTopology()) return;
if(!initMSR()) return;
readCoreCounterConfig();
#ifndef PCM_SILENT
printSystemTopology();
#endif
if(!detectNominalFrequency()) return;
showSpecControlMSRs();
#ifdef __linux__
if (isNMIWatchdogEnabled())
{
disableNMIWatchdog();
needToRestoreNMIWatchdog = true;
}
#endif
initEnergyMonitoring();
initUncoreObjects();
// Initialize RMID to the cores for QOS monitoring
initRMID();
readCPUMicrocodeLevel();
#ifdef PCM_USE_PERF
canUsePerf = true;
std::vector<int> dummy(PERF_MAX_COUNTERS, -1);
perfEventHandle.resize(num_cores, dummy);
#endif
for (int32 i = 0; i < num_cores; ++i)
{
coreTaskQueues.push_back(std::make_shared<CoreTaskQueue>(i));
}
}
void PCM::enableJKTWorkaround(bool enable)
{
if(disable_JKT_workaround) return;
std::cerr << "Using PCM on your system might have a performance impact as per http://software.intel.com/en-us/articles/performance-impact-when-sampling-certain-llc-events-on-snb-ep-with-vtune" << std::endl;
std::cerr << "You can avoid the performance impact by using the option --noJKTWA, however the cache metrics might be wrong then." << std::endl;
if(MSR.size())
{
for(int32 i = 0; i < num_cores; ++i)
{
uint64 val64 = 0;
MSR[i]->read(0x39C, &val64);
if(enable)
val64 |= 1ULL;
else
val64 &= (~1ULL);
MSR[i]->write(0x39C, val64);
}
}
for (size_t i = 0; i < (size_t)server_pcicfg_uncore.size(); ++i)
{
if(server_pcicfg_uncore[i].get()) server_pcicfg_uncore[i]->enableJKTWorkaround(enable);
}
}
void PCM::showSpecControlMSRs()
{
PCM_CPUID_INFO cpuinfo;
pcm_cpuid(7, 0, cpuinfo);
if (MSR.size())
{
if ((cpuinfo.reg.edx & (1 << 26)) || (cpuinfo.reg.edx & (1 << 27)))
{
uint64 val64 = 0;
MSR[0]->read(MSR_IA32_SPEC_CTRL, &val64);
std::cout << "IBRS enabled in the kernel : " << ((val64 & 1) ? "yes" : "no") << std::endl;
std::cout << "STIBP enabled in the kernel : " << ((val64 & 2) ? "yes" : "no") << std::endl;
}
if (cpuinfo.reg.edx & (1 << 29))
{
uint64 val64 = 0;
MSR[0]->read(MSR_IA32_ARCH_CAPABILITIES, &val64);
std::cout << "The processor is not susceptible to Rogue Data Cache Load: " << ((val64 & 1) ? "yes" : "no") << std::endl;
std::cout << "The processor supports enhanced IBRS : " << ((val64 & 2) ? "yes" : "no") << std::endl;
}
}
}
bool PCM::isCoreOnline(int32 os_core_id) const
{
return (topology[os_core_id].os_id != -1) && (topology[os_core_id].core_id != -1) && (topology[os_core_id].socket != -1);
}
bool PCM::isSocketOnline(int32 socket_id) const
{
return socketRefCore[socket_id] != -1;
}
bool PCM::isCPUModelSupported(int model_)
{
return ( model_ == NEHALEM_EP
|| model_ == NEHALEM_EX
|| model_ == WESTMERE_EP
|| model_ == WESTMERE_EX
|| model_ == ATOM
|| model_ == CLARKDALE
|| model_ == SANDY_BRIDGE
|| model_ == JAKETOWN
|| model_ == IVY_BRIDGE
|| model_ == HASWELL
|| model_ == IVYTOWN
|| model_ == HASWELLX
|| model_ == BDX_DE
|| model_ == BDX
|| model_ == BROADWELL
|| model_ == KNL
|| model_ == SKL
|| model_ == KBL
|| model_ == SKX
);
}
bool PCM::checkModel()
{
if (cpu_model == NEHALEM) cpu_model = NEHALEM_EP;
if ( cpu_model == ATOM_2
|| cpu_model == ATOM_CENTERTON
|| cpu_model == ATOM_BAYTRAIL
|| cpu_model == ATOM_AVOTON
|| cpu_model == ATOM_CHERRYTRAIL
|| cpu_model == ATOM_APOLLO_LAKE
|| cpu_model == ATOM_DENVERTON
) {
cpu_model = ATOM;
}
if (cpu_model == HASWELL_ULT || cpu_model == HASWELL_2) cpu_model = HASWELL;
if (cpu_model == BROADWELL_XEON_E3) cpu_model = BROADWELL;
if (cpu_model == SKL_UY) cpu_model = SKL;
if (cpu_model == KBL_1) cpu_model = KBL;
if(!isCPUModelSupported((int)cpu_model))
{
std::cerr << getUnsupportedMessage() << " CPU model number: " << cpu_model << " Brand: \"" << getCPUBrandString().c_str() <<"\""<< std::endl;
/* FOR TESTING PURPOSES ONLY */
#ifdef PCM_TEST_FALLBACK_TO_ATOM
std::cerr << "Fall back to ATOM functionality." << std::endl;
cpu_model = ATOM;
return true;
#endif
return false;
}
return true;
}
void PCM::destroyMSR()
{
MSR.clear();
}
PCM::~PCM()
{
InstanceLock lock(allow_multiple_instances);
if (instance)
{
destroyMSR();
instance = NULL;
}
}
bool PCM::good()
{
return !MSR.empty();
}
#ifdef PCM_USE_PERF
perf_event_attr PCM_init_perf_event_attr(bool group = true)
{
perf_event_attr e;
bzero(&e,sizeof(perf_event_attr));
e.type = -1; // must be set up later
e.size = sizeof(e);
e.config = -1; // must be set up later
e.sample_period = 0;
e.sample_type = 0;
e.read_format = group ? PERF_FORMAT_GROUP : 0; /* PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING |
PERF_FORMAT_ID | PERF_FORMAT_GROUP ; */
e.disabled = 0;
e.inherit = 0;
e.pinned = 1;
e.exclusive = 0;
e.exclude_user = 0;
e.exclude_kernel = 0;
e.exclude_hv = 0;
e.exclude_idle = 0;
e.mmap = 0;
e.comm = 0;
e.freq = 0;
e.inherit_stat = 0;
e.enable_on_exec = 0;
e.task = 0;
e.watermark = 0;
e.wakeup_events = 0;
return e;
}
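// Usage sketch (hedged): the returned attr still needs type/config filled in
// before it is passed to the perf_event_open(2) syscall, e.g.
//   perf_event_attr e = PCM_init_perf_event_attr();
//   e.type = PERF_TYPE_HARDWARE;
//   e.config = PERF_COUNT_HW_INSTRUCTIONS;
//   int fd = syscall(SYS_perf_event_open, &e, -1 /*pid: any*/, core, leader_fd, 0);
// where leader_fd is -1 when creating the group leader for a core.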
#endif
PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter_)
{
if(allow_multiple_instances && (EXT_CUSTOM_CORE_EVENTS == mode_ || CUSTOM_CORE_EVENTS == mode_))
{
allow_multiple_instances = false;
std::cerr << "Warning: multiple PCM instance mode is not allowed with custom events." << std::endl;
}
InstanceLock lock(allow_multiple_instances);
if (MSR.empty()) return PCM::MSRAccessDenied;
ExtendedCustomCoreEventDescription * pExtDesc = (ExtendedCustomCoreEventDescription *)parameter_;
#ifdef PCM_USE_PERF
std::cerr << "Trying to use Linux perf events..." << std::endl;
const char * no_perf_env = std::getenv("PCM_NO_PERF");
if (no_perf_env != NULL && std::string(no_perf_env) == std::string("1"))
{
canUsePerf = false;
std::cout << "Usage of Linux perf events is disabled through PCM_NO_PERF environment variable. Using direct PMU programming..." << std::endl;
}
if(num_online_cores < num_cores)
{
canUsePerf = false;
std::cerr << "PCM does not support using Linux perf API on systems with offlined cores. Falling-back to direct PMU programming."
<< std::endl;
}
else if(PERF_COUNT_HW_MAX <= PCM_PERF_COUNT_HW_REF_CPU_CYCLES)
{
canUsePerf = false;
std::cerr << "Can not use Linux perf because your Linux kernel does not support PERF_COUNT_HW_REF_CPU_CYCLES event. Falling-back to direct PMU programming." << std::endl;
}
else if(EXT_CUSTOM_CORE_EVENTS == mode_ && pExtDesc && pExtDesc->fixedCfg)
{
canUsePerf = false;
std::cerr << "Can not use Linux perf because non-standard fixed counter configuration requested. Falling-back to direct PMU programming." << std::endl;
}
else if(EXT_CUSTOM_CORE_EVENTS == mode_ && pExtDesc && (pExtDesc->OffcoreResponseMsrValue[0] || pExtDesc->OffcoreResponseMsrValue[1]))
{
const std::string offcore_rsp_format = readSysFS("/sys/bus/event_source/devices/cpu/format/offcore_rsp");
if (offcore_rsp_format != "config1:0-63\n")
{
canUsePerf = false;
std::cerr << "Can not use Linux perf because OffcoreResponse usage is not supported. Falling-back to direct PMU programming." << std::endl;
}
}
#endif
if(allow_multiple_instances)
{
//std::cerr << "Checking for other instances of PCM..." << std::endl;
#ifdef _MSC_VER
numInstancesSemaphore = CreateSemaphore(NULL, 0, 1 << 20, L"Global\\Number of running Processor Counter Monitor instances");
if (!numInstancesSemaphore)
{
_com_error error(GetLastError());
std::wcerr << "Error in Windows function 'CreateSemaphore': " << GetLastError() << " ";
const TCHAR * strError = _com_error(GetLastError()).ErrorMessage();
if (strError) std::wcerr << strError;
std::wcerr << std::endl;
return PCM::UnknownError;
}
LONG prevValue = 0;
if (!ReleaseSemaphore(numInstancesSemaphore, 1, &prevValue))
{
const DWORD lastError = GetLastError(); // capture once: later calls may overwrite it
std::wcerr << "Error in Windows function 'ReleaseSemaphore': " << lastError << " ";
const TCHAR * strError = _com_error(lastError).ErrorMessage();
if (strError) std::wcerr << strError;
std::wcerr << std::endl;
return PCM::UnknownError;
}
if (prevValue > 0) // already programmed since another instance exists
{
std::cerr << "Number of PCM instances: " << (prevValue + 1) << std::endl;
if (hasPCICFGUncore() && max_qpi_speed==0)
for (size_t i = 0; i < (size_t)server_pcicfg_uncore.size(); ++i)
if (server_pcicfg_uncore[i].get())
max_qpi_speed = (std::max)(server_pcicfg_uncore[i]->computeQPISpeed(socketRefCore[i], cpu_model), max_qpi_speed); // parenthesis to avoid macro expansion on Windows
reportQPISpeed();
return PCM::Success;
}
#else // if linux, apple, freebsd or dragonflybsd
numInstancesSemaphore = sem_open(PCM_NUM_INSTANCES_SEMAPHORE_NAME, O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO, 0);
if (SEM_FAILED == numInstancesSemaphore)
{
if (EACCES == errno)
std::cerr << "PCM Error, do not have permissions to open semaphores in /dev/shm/. Clean up them." << std::endl;
return PCM::UnknownError;
}
#ifndef __APPLE__
sem_post(numInstancesSemaphore);
int curValue = 0;
sem_getvalue(numInstancesSemaphore, &curValue);
#else //if it is apple
uint32 curValue = PCM::incrementNumInstances();
sem_post(numInstancesSemaphore);
#endif // end ifndef __APPLE__
if (curValue > 1) // already programmed since another instance exists
{
std::cerr << "Number of PCM instances: " << curValue << std::endl;
if (hasPCICFGUncore() && max_qpi_speed==0)
for (int i = 0; i < (int)server_pcicfg_uncore.size(); ++i) {
if (server_pcicfg_uncore[i].get())
max_qpi_speed = std::max(server_pcicfg_uncore[i]->computeQPISpeed(socketRefCore[i], cpu_model), max_qpi_speed);
}
reportQPISpeed(); // report once after all sockets are queried, matching the Windows branch above
if(!canUsePerf) return PCM::Success;
}
#endif // end ifdef _MSC_VER
#ifdef PCM_USE_PERF
/*
numInst>1  && canUsePerf==false -> not reachable: the PMU was already programmed by another PCM instance
numInst>1  && canUsePerf==true  -> perf is programmed in a different PCM instance; this is not allowed
numInst<=1 && canUsePerf==false -> we are the first instance and perf cannot be used: *check* whether the PMU is busy
numInst<=1 && canUsePerf==true  -> we are the first instance and perf will be used: *don't check*, this is perf's business now
*/
if(curValue > 1 && (canUsePerf == true))
{
std::cerr << "Running several clients using the same counters is not posible with Linux perf. Recompile PCM without Linux Perf support to allow such usage. " << std::endl;
decrementInstanceSemaphore();
return PCM::UnknownError;
}
if((curValue <= 1) && (canUsePerf == false) && PMUinUse())
{
decrementInstanceSemaphore();
return PCM::PMUBusy;
}
#else
if (PMUinUse())
{
decrementInstanceSemaphore();
return PCM::PMUBusy;
}
#endif
}
else
{
if((canUsePerf == false) && PMUinUse())
{
return PCM::PMUBusy;
}
}
mode = mode_;
// copy custom event descriptions
if (mode == CUSTOM_CORE_EVENTS)
{
if (!parameter_)
{
std::cerr << "PCM Internal Error: data structure for custom event not initialized" << std::endl;
return PCM::UnknownError;
}
CustomCoreEventDescription * pDesc = (CustomCoreEventDescription *)parameter_;
coreEventDesc[0] = pDesc[0];
coreEventDesc[1] = pDesc[1];
if (cpu_model != ATOM && cpu_model != KNL)
{
coreEventDesc[2] = pDesc[2];
coreEventDesc[3] = pDesc[3];
core_gen_counter_num_used = 4;
}
else
core_gen_counter_num_used = 2;
}
else if (mode != EXT_CUSTOM_CORE_EVENTS)
{
switch ( cpu_model ) {
case ATOM:
case KNL:
coreEventDesc[0].event_number = ARCH_LLC_MISS_EVTNR;
coreEventDesc[0].umask_value = ARCH_LLC_MISS_UMASK;
coreEventDesc[1].event_number = ARCH_LLC_REFERENCE_EVTNR;
coreEventDesc[1].umask_value = ARCH_LLC_REFERENCE_UMASK;
L2CacheHitRatioAvailable = true;
L2CacheMissesAvailable = true;
L2CacheHitsAvailable = true;
core_gen_counter_num_used = 2;
break;
case SKL:
case SKX:
case KBL:
assert(useSkylakeEvents());
coreEventDesc[0].event_number = SKL_MEM_LOAD_RETIRED_L3_MISS_EVTNR;
coreEventDesc[0].umask_value = SKL_MEM_LOAD_RETIRED_L3_MISS_UMASK;
coreEventDesc[1].event_number = SKL_MEM_LOAD_RETIRED_L3_HIT_EVTNR;
coreEventDesc[1].umask_value = SKL_MEM_LOAD_RETIRED_L3_HIT_UMASK;
coreEventDesc[2].event_number = SKL_MEM_LOAD_RETIRED_L2_MISS_EVTNR;
coreEventDesc[2].umask_value = SKL_MEM_LOAD_RETIRED_L2_MISS_UMASK;
coreEventDesc[3].event_number = SKL_MEM_LOAD_RETIRED_L2_HIT_EVTNR;
coreEventDesc[3].umask_value = SKL_MEM_LOAD_RETIRED_L2_HIT_UMASK;
if (core_gen_counter_num_max == 3)
{
L3CacheHitRatioAvailable = true;
L3CacheMissesAvailable = true;
L2CacheMissesAvailable = true;
L3CacheHitsSnoopAvailable = true;
L3CacheHitsAvailable = true;
core_gen_counter_num_used = 3;
break;
}
L2CacheHitRatioAvailable = true;
L3CacheHitRatioAvailable = true;
L3CacheMissesAvailable = true;
L2CacheMissesAvailable = true;
L2CacheHitsAvailable = true;
L3CacheHitsSnoopAvailable = true;
L3CacheHitsAvailable = true;
core_gen_counter_num_used = 4;
break;
case SANDY_BRIDGE:
case JAKETOWN:
case IVYTOWN:
case IVY_BRIDGE:
case HASWELL:
case HASWELLX:
case BROADWELL:
case BDX_DE:
case BDX:
coreEventDesc[0].event_number = ARCH_LLC_MISS_EVTNR;
coreEventDesc[0].umask_value = ARCH_LLC_MISS_UMASK;
coreEventDesc[1].event_number = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE_EVTNR;
coreEventDesc[1].umask_value = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE_UMASK;
coreEventDesc[2].event_number = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_EVTNR;
coreEventDesc[2].umask_value = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_UMASK;
coreEventDesc[3].event_number = MEM_LOAD_UOPS_RETIRED_L2_HIT_EVTNR;
coreEventDesc[3].umask_value = MEM_LOAD_UOPS_RETIRED_L2_HIT_UMASK;
L2CacheHitRatioAvailable = true;
L3CacheHitRatioAvailable = true;
L3CacheMissesAvailable = true;
L2CacheMissesAvailable = true;
L2CacheHitsAvailable = true;
L3CacheHitsNoSnoopAvailable = true;
L3CacheHitsSnoopAvailable = true;
L3CacheHitsAvailable = true;
core_gen_counter_num_used = 4;
break;
case NEHALEM_EP:
case WESTMERE_EP:
case CLARKDALE:
coreEventDesc[0].event_number = MEM_LOAD_RETIRED_L3_MISS_EVTNR;
coreEventDesc[0].umask_value = MEM_LOAD_RETIRED_L3_MISS_UMASK;
coreEventDesc[1].event_number = MEM_LOAD_RETIRED_L3_UNSHAREDHIT_EVTNR;
coreEventDesc[1].umask_value = MEM_LOAD_RETIRED_L3_UNSHAREDHIT_UMASK;
coreEventDesc[2].event_number = MEM_LOAD_RETIRED_L2_HITM_EVTNR;
coreEventDesc[2].umask_value = MEM_LOAD_RETIRED_L2_HITM_UMASK;
coreEventDesc[3].event_number = MEM_LOAD_RETIRED_L2_HIT_EVTNR;
coreEventDesc[3].umask_value = MEM_LOAD_RETIRED_L2_HIT_UMASK;
L2CacheHitRatioAvailable = true;
L3CacheHitRatioAvailable = true;
L3CacheMissesAvailable = true;
L2CacheMissesAvailable = true;
L2CacheHitsAvailable = true;
L3CacheHitsNoSnoopAvailable = true;
L3CacheHitsSnoopAvailable = true;
L3CacheHitsAvailable = true;
core_gen_counter_num_used = 4;
break;
default:
assert(!useSkylakeEvents());
coreEventDesc[0].event_number = ARCH_LLC_MISS_EVTNR;
coreEventDesc[0].umask_value = ARCH_LLC_MISS_UMASK;
coreEventDesc[1].event_number = MEM_LOAD_RETIRED_L3_UNSHAREDHIT_EVTNR;
coreEventDesc[1].umask_value = MEM_LOAD_RETIRED_L3_UNSHAREDHIT_UMASK;
coreEventDesc[2].event_number = MEM_LOAD_RETIRED_L2_HITM_EVTNR;
coreEventDesc[2].umask_value = MEM_LOAD_RETIRED_L2_HITM_UMASK;
coreEventDesc[3].event_number = MEM_LOAD_RETIRED_L2_HIT_EVTNR;
coreEventDesc[3].umask_value = MEM_LOAD_RETIRED_L2_HIT_UMASK;
L2CacheHitRatioAvailable = true;
L3CacheHitRatioAvailable = true;
L3CacheMissesAvailable = true;
L2CacheMissesAvailable = true;
L2CacheHitsAvailable = true;
L3CacheHitsNoSnoopAvailable = true;
L3CacheHitsSnoopAvailable = true;
L3CacheHitsAvailable = true;
core_gen_counter_num_used = 4;
}
}
core_fixed_counter_num_used = 3;
if(EXT_CUSTOM_CORE_EVENTS == mode_ && pExtDesc && pExtDesc->gpCounterCfg)
{
core_gen_counter_num_used = pExtDesc->nGPCounters;
}
if(cpu_model == JAKETOWN)
{
bool enableWA = false;
for(uint32 i = 0; i< core_gen_counter_num_used; ++i)
{
if(coreEventDesc[i].event_number == MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_EVTNR)
enableWA = true;
}
enableJKTWorkaround(enableWA); // this has a performance penalty on memory access
}
if (core_gen_counter_num_used > core_gen_counter_num_max)
{
std::cerr << "PCM ERROR: Trying to program " << core_gen_counter_num_used << " general purpose counters with only "
<< core_gen_counter_num_max << " available" << std::endl;
return PCM::UnknownError;
}
if (core_fixed_counter_num_used > core_fixed_counter_num_max)
{
std::cerr << "PCM ERROR: Trying to program " << core_fixed_counter_num_used << " fixed counters with only "
<< core_fixed_counter_num_max << " available" << std::endl;
return PCM::UnknownError;
}
programmed_pmu = true;
lastProgrammedCustomCounters.clear();
lastProgrammedCustomCounters.resize(num_cores);
// Version for linux/windows/freebsd/dragonflybsd
for (int i = 0; i < (int)num_cores; ++i)
{
TemporalThreadAffinity tempThreadAffinity(i); // speedup trick for Linux
const auto status = programCoreCounters(i, mode_, pExtDesc, lastProgrammedCustomCounters[i]);
if (status != PCM::Success)
{
return status;
}
// program uncore counters
if (cpu_model == NEHALEM_EP || cpu_model == WESTMERE_EP || cpu_model == CLARKDALE)
{
programNehalemEPUncore(i);
}
else if (hasBecktonUncore())
{
programBecktonUncore(i);
}
}
if(canUsePerf)
{
std::cerr << "Successfully programmed on-core PMU using Linux perf"<<std::endl;
}
if (hasPCICFGUncore())
{
std::vector<std::future<uint64>> qpi_speeds;
for (size_t i = 0; i < (size_t)server_pcicfg_uncore.size(); ++i)
{
server_pcicfg_uncore[i]->program();
qpi_speeds.push_back(std::move(std::async(std::launch::async,
&ServerPCICFGUncore::computeQPISpeed, server_pcicfg_uncore[i].get(), socketRefCore[i], cpu_model)));
}
for (size_t i = 0; i < (size_t)server_pcicfg_uncore.size(); ++i)
{
max_qpi_speed = (std::max)(qpi_speeds[i].get(), max_qpi_speed);
}
}
programLLCReadMissLatencyEvents();
reportQPISpeed();
return PCM::Success;
}
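// Client usage sketch (illustrative; assumes the default arguments for
// program() declared in cpucounters.h):
//
//   PCM * m = PCM::getInstance();
//   if (m->program() != PCM::Success) { /* handle error */ }
//   SystemCounterState before = getSystemCounterState();
//   /* ... run the workload under measurement ... */
//   SystemCounterState after = getSystemCounterState();
//   m->cleanup();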
PCM::ErrorCode PCM::programCoreCounters(const int i /* core */,
const PCM::ProgramMode mode_,
const ExtendedCustomCoreEventDescription * pExtDesc,
std::vector<EventSelectRegister> & result)
{
// program core counters
result.clear();
FixedEventControlRegister ctrl_reg;
#ifdef PCM_USE_PERF
int leader_counter = -1;
perf_event_attr e = PCM_init_perf_event_attr();
if (canUsePerf)
{
e.type = PERF_TYPE_HARDWARE;
e.config = PERF_COUNT_HW_INSTRUCTIONS;
if ((perfEventHandle[i][PERF_INST_RETIRED_ANY_POS] = syscall(SYS_perf_event_open, &e, -1,
i /* core id */, leader_counter /* group leader */, 0)) <= 0)
{
std::cerr << "Linux Perf: Error on programming INST_RETIRED_ANY: " << strerror(errno) << std::endl;
if (errno == EMFILE) std::cerr << "try executing 'ulimit -n 10000' to increase the limit on the number of open files." << std::endl;
decrementInstanceSemaphore();
return PCM::UnknownError;
}
leader_counter = perfEventHandle[i][PERF_INST_RETIRED_ANY_POS];
e.pinned = 0; // the following counters are not group leaders, so they need not be pinned explicitly
e.config = PERF_COUNT_HW_CPU_CYCLES;
if ((perfEventHandle[i][PERF_CPU_CLK_UNHALTED_THREAD_POS] = syscall(SYS_perf_event_open, &e, -1,
i /* core id */, leader_counter /* group leader */, 0)) <= 0)
{
std::cerr << "Linux Perf: Error on programming CPU_CLK_UNHALTED_THREAD: " << strerror(errno) << std::endl;
if (errno == EMFILE) std::cerr << "try executing 'ulimit -n 10000' to increase the limit on the number of open files." << std::endl;
decrementInstanceSemaphore();
return PCM::UnknownError;
}
e.config = PCM_PERF_COUNT_HW_REF_CPU_CYCLES;
if ((perfEventHandle[i][PERF_CPU_CLK_UNHALTED_REF_POS] = syscall(SYS_perf_event_open, &e, -1,
i /* core id */, leader_counter /* group leader */, 0)) <= 0)
{
std::cerr << "Linux Perf: Error on programming CPU_CLK_UNHALTED_REF: " << strerror(errno) << std::endl;
if (errno == EMFILE) std::cerr << "try executing 'ulimit -n 10000' to increase the limit on the number of open files." << std::endl;
decrementInstanceSemaphore();
return PCM::UnknownError;
}
}
else
#endif
{
// disable counters while programming
MSR[i]->write(IA32_CR_PERF_GLOBAL_CTRL, 0);
MSR[i]->read(IA32_CR_FIXED_CTR_CTRL, &ctrl_reg.value);
if (EXT_CUSTOM_CORE_EVENTS == mode_ && pExtDesc && pExtDesc->fixedCfg)
{
ctrl_reg = *(pExtDesc->fixedCfg);
}
else
{
ctrl_reg.fields.os0 = 1;
ctrl_reg.fields.usr0 = 1;
ctrl_reg.fields.any_thread0 = 0;
ctrl_reg.fields.enable_pmi0 = 0;
ctrl_reg.fields.os1 = 1;
ctrl_reg.fields.usr1 = 1;
ctrl_reg.fields.any_thread1 = 0;
ctrl_reg.fields.enable_pmi1 = 0;
ctrl_reg.fields.os2 = 1;
ctrl_reg.fields.usr2 = 1;
ctrl_reg.fields.any_thread2 = 0;
ctrl_reg.fields.enable_pmi2 = 0;
ctrl_reg.fields.reserved1 = 0;
}
MSR[i]->write(IA32_CR_FIXED_CTR_CTRL, ctrl_reg.value);
}
if (EXT_CUSTOM_CORE_EVENTS == mode_ && pExtDesc)
{
if (pExtDesc->OffcoreResponseMsrValue[0]) // still need to do also if perf API is used due to a bug in perf
MSR[i]->write(MSR_OFFCORE_RSP0, pExtDesc->OffcoreResponseMsrValue[0]);
if (pExtDesc->OffcoreResponseMsrValue[1])
MSR[i]->write(MSR_OFFCORE_RSP1, pExtDesc->OffcoreResponseMsrValue[1]);
}
EventSelectRegister event_select_reg;
for (uint32 j = 0; j < core_gen_counter_num_used; ++j)
{
if (EXT_CUSTOM_CORE_EVENTS == mode_ && pExtDesc && pExtDesc->gpCounterCfg)
{
event_select_reg = pExtDesc->gpCounterCfg[j];
}
else
{
MSR[i]->read(IA32_PERFEVTSEL0_ADDR + j, &event_select_reg.value); // read-only also safe for perf
event_select_reg.fields.event_select = coreEventDesc[j].event_number;
event_select_reg.fields.umask = coreEventDesc[j].umask_value;
event_select_reg.fields.usr = 1;
event_select_reg.fields.os = 1;
event_select_reg.fields.edge = 0;
event_select_reg.fields.pin_control = 0;
event_select_reg.fields.apic_int = 0;
event_select_reg.fields.any_thread = 0;
event_select_reg.fields.enable = 1;
event_select_reg.fields.invert = 0;
event_select_reg.fields.cmask = 0;
event_select_reg.fields.in_tx = 0;
event_select_reg.fields.in_txcp = 0;
}
result.push_back(event_select_reg);
#ifdef PCM_USE_PERF
if (canUsePerf)
{
e.type = PERF_TYPE_RAW;
e.config = (1ULL << 63ULL) + event_select_reg.value;
if (event_select_reg.fields.event_select == OFFCORE_RESPONSE_0_EVTNR)
e.config1 = pExtDesc->OffcoreResponseMsrValue[0];
if (event_select_reg.fields.event_select == OFFCORE_RESPONSE_1_EVTNR)
e.config1 = pExtDesc->OffcoreResponseMsrValue[1];
if ((perfEventHandle[i][PERF_GEN_EVENT_0_POS + j] = syscall(SYS_perf_event_open, &e, -1,
i /* core id */, leader_counter /* group leader */, 0)) <= 0)
{
std::cerr << "Linux Perf: Error on programming generic event #" << i << " error: " << strerror(errno) << std::endl;
if (errno == 24) std::cerr << "try executing 'ulimit -n 10000' to increase the limit on the number of open files." << std::endl;
decrementInstanceSemaphore();
return PCM::UnknownError;
}
}
else
#endif
{
MSR[i]->write(IA32_PMC0 + j, 0);
MSR[i]->write(IA32_PERFEVTSEL0_ADDR + j, event_select_reg.value);
}
}
if (!canUsePerf)
{
// start counting, enable all (4 programmable + 3 fixed) counters
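// IA32_CR_PERF_GLOBAL_CTRL bit layout: bit j (j=0..3) enables general-purpose
// counter PMCj, bit 32+k (k=0..2) enables fixed counter k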
uint64 value = (1ULL << 0) + (1ULL << 1) + (1ULL << 2) + (1ULL << 3) + (1ULL << 32) + (1ULL << 33) + (1ULL << 34);
if (cpu_model == ATOM || cpu_model == KNL) // KNL and Atom have 3 fixed + only 2 programmable counters
value = (1ULL << 0) + (1ULL << 1) + (1ULL << 32) + (1ULL << 33) + (1ULL << 34);
MSR[i]->write(IA32_CR_PERF_GLOBAL_CTRL, value);
}
return PCM::Success;
}
void PCM::reportQPISpeed() const
{
if (!max_qpi_speed) return;
if (hasPCICFGUncore()) {
for (size_t i = 0; i < (size_t)server_pcicfg_uncore.size(); ++i)
{
std::cerr << "Socket " << i << std::endl;
if(server_pcicfg_uncore[i].get()) server_pcicfg_uncore[i]->reportQPISpeed();
}
} else {
std::cerr << "Max QPI speed: " << max_qpi_speed / (1e9) << " GBytes/second (" << max_qpi_speed / (1e9*getBytesPerLinkTransfer()) << " GT/second)" << std::endl;
}
}
void PCM::programNehalemEPUncore(int32 core)
{
#define CPUCNT_INIT_THE_REST_OF_EVTCNT \
unc_event_select_reg.fields.occ_ctr_rst = 1; \
unc_event_select_reg.fields.edge = 0; \
unc_event_select_reg.fields.enable_pmi = 0; \
unc_event_select_reg.fields.enable = 1; \
unc_event_select_reg.fields.invert = 0; \
unc_event_select_reg.fields.cmask = 0;
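// The macro above resets the control fields shared by all eight uncore event
// select registers programmed below; only event_select and umask vary per counter.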
uncore_gen_counter_num_used = 8;
UncoreEventSelectRegister unc_event_select_reg;
MSR[core]->read(MSR_UNCORE_PERFEVTSEL0_ADDR, &unc_event_select_reg.value);
unc_event_select_reg.fields.event_select = UNC_QMC_WRITES_FULL_ANY_EVTNR;
unc_event_select_reg.fields.umask = UNC_QMC_WRITES_FULL_ANY_UMASK;
CPUCNT_INIT_THE_REST_OF_EVTCNT
MSR[core]->write(MSR_UNCORE_PERFEVTSEL0_ADDR, unc_event_select_reg.value);
MSR[core]->read(MSR_UNCORE_PERFEVTSEL1_ADDR, &unc_event_select_reg.value);
unc_event_select_reg.fields.event_select = UNC_QMC_NORMAL_READS_ANY_EVTNR;
unc_event_select_reg.fields.umask = UNC_QMC_NORMAL_READS_ANY_UMASK;
CPUCNT_INIT_THE_REST_OF_EVTCNT
MSR[core]->write(MSR_UNCORE_PERFEVTSEL1_ADDR, unc_event_select_reg.value);
MSR[core]->read(MSR_UNCORE_PERFEVTSEL2_ADDR, &unc_event_select_reg.value);
unc_event_select_reg.fields.event_select = UNC_QHL_REQUESTS_EVTNR;
unc_event_select_reg.fields.umask = UNC_QHL_REQUESTS_IOH_READS_UMASK;
CPUCNT_INIT_THE_REST_OF_EVTCNT
MSR[core]->write(MSR_UNCORE_PERFEVTSEL2_ADDR, unc_event_select_reg.value);
MSR[core]->read(MSR_UNCORE_PERFEVTSEL3_ADDR, &unc_event_select_reg.value);
unc_event_select_reg.fields.event_select = UNC_QHL_REQUESTS_EVTNR;
unc_event_select_reg.fields.umask = UNC_QHL_REQUESTS_IOH_WRITES_UMASK;
CPUCNT_INIT_THE_REST_OF_EVTCNT
MSR[core]->write(MSR_UNCORE_PERFEVTSEL3_ADDR, unc_event_select_reg.value);
MSR[core]->read(MSR_UNCORE_PERFEVTSEL4_ADDR, &unc_event_select_reg.value);
unc_event_select_reg.fields.event_select = UNC_QHL_REQUESTS_EVTNR;
unc_event_select_reg.fields.umask = UNC_QHL_REQUESTS_REMOTE_READS_UMASK;
CPUCNT_INIT_THE_REST_OF_EVTCNT
MSR[core]->write(MSR_UNCORE_PERFEVTSEL4_ADDR, unc_event_select_reg.value);
MSR[core]->read(MSR_UNCORE_PERFEVTSEL5_ADDR, &unc_event_select_reg.value);
unc_event_select_reg.fields.event_select = UNC_QHL_REQUESTS_EVTNR;
unc_event_select_reg.fields.umask = UNC_QHL_REQUESTS_REMOTE_WRITES_UMASK;
CPUCNT_INIT_THE_REST_OF_EVTCNT
MSR[core]->write(MSR_UNCORE_PERFEVTSEL5_ADDR, unc_event_select_reg.value);
MSR[core]->read(MSR_UNCORE_PERFEVTSEL6_ADDR, &unc_event_select_reg.value);
unc_event_select_reg.fields.event_select = UNC_QHL_REQUESTS_EVTNR;
unc_event_select_reg.fields.umask = UNC_QHL_REQUESTS_LOCAL_READS_UMASK;
CPUCNT_INIT_THE_REST_OF_EVTCNT
MSR[core]->write(MSR_UNCORE_PERFEVTSEL6_ADDR, unc_event_select_reg.value);
MSR[core]->read(MSR_UNCORE_PERFEVTSEL7_ADDR, &unc_event_select_reg.value);
unc_event_select_reg.fields.event_select = UNC_QHL_REQUESTS_EVTNR;
unc_event_select_reg.fields.umask = UNC_QHL_REQUESTS_LOCAL_WRITES_UMASK;
CPUCNT_INIT_THE_REST_OF_EVTCNT
MSR[core]->write(MSR_UNCORE_PERFEVTSEL7_ADDR, unc_event_select_reg.value);
#undef CPUCNT_INIT_THE_REST_OF_EVTCNT
// start uncore counting
uint64 value = 255 + (1ULL << 32); // enable the 8 generic counters (bits 0-7) and the fixed counter (bit 32)
MSR[core]->write(MSR_UNCORE_PERF_GLOBAL_CTRL_ADDR, value);
// synchronise counters
MSR[core]->write(MSR_UNCORE_PMC0, 0);
MSR[core]->write(MSR_UNCORE_PMC1, 0);
MSR[core]->write(MSR_UNCORE_PMC2, 0);
MSR[core]->write(MSR_UNCORE_PMC3, 0);
MSR[core]->write(MSR_UNCORE_PMC4, 0);
MSR[core]->write(MSR_UNCORE_PMC5, 0);
MSR[core]->write(MSR_UNCORE_PMC6, 0);
MSR[core]->write(MSR_UNCORE_PMC7, 0);
}
void PCM::programBecktonUncore(int32 core)
{
// program Beckton uncore
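// Beckton (Nehalem-EX/Westmere-EX) uncore boxes touched below: U-box (global
// PMON control, U_*), M-boxes (memory controllers, MB*), B-boxes (home agents, BB*),
// R-box (QPI router, R_*) and W-box (W_*, hosting the fixed uncore clock counter).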
if (core == socketRefCore[0]) computeQPISpeedBeckton((int)core);
uint64 value = 1ULL << 29; // reset all counters
MSR[core]->write(U_MSR_PMON_GLOBAL_CTL, value);
BecktonUncorePMUZDPCTLFVCRegister FVCreg;
FVCreg.value = 0;
if (cpu_model == NEHALEM_EX)
{
FVCreg.fields.bcmd = 0; // rd_bcmd
FVCreg.fields.resp = 0; // ack_resp
FVCreg.fields.evnt0 = 5; // bcmd_match
FVCreg.fields.evnt1 = 6; // resp_match
FVCreg.fields.pbox_init_err = 0;
}
else
{
FVCreg.fields_wsm.bcmd = 0; // rd_bcmd
FVCreg.fields_wsm.resp = 0; // ack_resp
FVCreg.fields_wsm.evnt0 = 5; // bcmd_match
FVCreg.fields_wsm.evnt1 = 6; // resp_match
FVCreg.fields_wsm.pbox_init_err = 0;
}
MSR[core]->write(MB0_MSR_PMU_ZDP_CTL_FVC, FVCreg.value);
MSR[core]->write(MB1_MSR_PMU_ZDP_CTL_FVC, FVCreg.value);
BecktonUncorePMUCNTCTLRegister CNTCTLreg;
CNTCTLreg.value = 0;
CNTCTLreg.fields.en = 1;
CNTCTLreg.fields.pmi_en = 0;
CNTCTLreg.fields.count_mode = 0;
CNTCTLreg.fields.storage_mode = 0;
CNTCTLreg.fields.wrap_mode = 1;
CNTCTLreg.fields.flag_mode = 0;
CNTCTLreg.fields.inc_sel = 0x0d; // FVC_EV0
MSR[core]->write(MB0_MSR_PMU_CNT_CTL_0, CNTCTLreg.value);
MSR[core]->write(MB1_MSR_PMU_CNT_CTL_0, CNTCTLreg.value);
CNTCTLreg.fields.inc_sel = 0x0e; // FVC_EV1
MSR[core]->write(MB0_MSR_PMU_CNT_CTL_1, CNTCTLreg.value);
MSR[core]->write(MB1_MSR_PMU_CNT_CTL_1, CNTCTLreg.value);
value = 1 + ((0x0C) << 1ULL); // enable bit + (event select IMT_INSERTS_WR)
MSR[core]->write(BB0_MSR_PERF_CNT_CTL_1, value);
MSR[core]->write(BB1_MSR_PERF_CNT_CTL_1, value);
MSR[core]->write(MB0_MSR_PERF_GLOBAL_CTL, 3); // enable two counters
MSR[core]->write(MB1_MSR_PERF_GLOBAL_CTL, 3); // enable two counters
MSR[core]->write(BB0_MSR_PERF_GLOBAL_CTL, 2); // enable second counter
MSR[core]->write(BB1_MSR_PERF_GLOBAL_CTL, 2); // enable second counter
// program R-Box to monitor QPI traffic
// enable counting on all counters on the left side (port 0-3)
MSR[core]->write(R_MSR_PMON_GLOBAL_CTL_7_0, 255);
// ... on the right side (port 4-7)
MSR[core]->write(R_MSR_PMON_GLOBAL_CTL_15_8, 255);
// pick the event
value = (1ULL << 7) + (1ULL << 6) + (1ULL << 2); // count any (incoming) data responses
MSR[core]->write(R_MSR_PORT0_IPERF_CFG0, value);
MSR[core]->write(R_MSR_PORT1_IPERF_CFG0, value);
MSR[core]->write(R_MSR_PORT4_IPERF_CFG0, value);
MSR[core]->write(R_MSR_PORT5_IPERF_CFG0, value);
// pick the event
value = (1ULL << 30ULL); // count null idle flits sent
MSR[core]->write(R_MSR_PORT0_IPERF_CFG1, value);
MSR[core]->write(R_MSR_PORT1_IPERF_CFG1, value);
MSR[core]->write(R_MSR_PORT4_IPERF_CFG1, value);
MSR[core]->write(R_MSR_PORT5_IPERF_CFG1, value);
// choose counter 0 to monitor R_MSR_PORT0_IPERF_CFG0
MSR[core]->write(R_MSR_PMON_CTL0, 1 + 2 * (0));
// choose counter 1 to monitor R_MSR_PORT1_IPERF_CFG0
MSR[core]->write(R_MSR_PMON_CTL1, 1 + 2 * (6));
// choose counter 8 to monitor R_MSR_PORT4_IPERF_CFG0
MSR[core]->write(R_MSR_PMON_CTL8, 1 + 2 * (0));
// choose counter 9 to monitor R_MSR_PORT5_IPERF_CFG0
MSR[core]->write(R_MSR_PMON_CTL9, 1 + 2 * (6));
// choose counter 2 to monitor R_MSR_PORT0_IPERF_CFG1
MSR[core]->write(R_MSR_PMON_CTL2, 1 + 2 * (1));
// choose counter 3 to monitor R_MSR_PORT1_IPERF_CFG1
MSR[core]->write(R_MSR_PMON_CTL3, 1 + 2 * (7));
// choose counter 10 to monitor R_MSR_PORT4_IPERF_CFG1
MSR[core]->write(R_MSR_PMON_CTL10, 1 + 2 * (1));
// choose counter 11 to monitor R_MSR_PORT5_IPERF_CFG1
MSR[core]->write(R_MSR_PMON_CTL11, 1 + 2 * (7));
// enable uncore TSC counter (fixed one)
MSR[core]->write(W_MSR_PMON_GLOBAL_CTL, 1ULL << 31ULL);
MSR[core]->write(W_MSR_PMON_FIXED_CTR_CTL, 1ULL);
value = (1ULL << 28) + 1; // enable all counters
MSR[core]->write(U_MSR_PMON_GLOBAL_CTL, value);
}
uint64 RDTSC();
void PCM::computeNominalFrequency()
{
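// Estimate the nominal frequency by counting TSC ticks across a one-second
// sleep; this assumes an invariant TSC incrementing at the nominal (base) frequency.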
const int ref_core = 0;
uint64 before = 0, after = 0;
MSR[ref_core]->read(IA32_TIME_STAMP_COUNTER, &before);
MySleepMs(1000);
MSR[ref_core]->read(IA32_TIME_STAMP_COUNTER, &after);
nominal_frequency = after-before;
}
std::string PCM::getCPUBrandString()
{
char buffer[sizeof(int)*4*3+1];
PCM_CPUID_INFO * info = (PCM_CPUID_INFO *) buffer;
pcm_cpuid(0x80000002, *info);
++info;
pcm_cpuid(0x80000003, *info);
++info;
pcm_cpuid(0x80000004, *info);
buffer[sizeof(int)*4*3] = 0;
std::string result(buffer);
while(result[0]==' ') result.erase(0,1);
std::string::size_type i;
while((i = result.find("  ")) != std::string::npos) result.replace(i, 2, " "); // collapse double spaces into single ones
return result;
}
std::string PCM::getCPUFamilyModelString()
{
char buffer[sizeof(int)*4*3+6];
memset(buffer,0,sizeof(buffer));
#ifdef _MSC_VER
sprintf_s(buffer,sizeof(buffer),"GenuineIntel-%d-%2X-%X",this->cpu_family,this->original_cpu_model,this->cpu_stepping);
#else
snprintf(buffer,sizeof(buffer),"GenuineIntel-%d-%2X-%X",this->cpu_family,this->original_cpu_model,this->cpu_stepping);
#endif
std::string result(buffer);
return result;
}
void PCM::enableForceRTMAbortMode()
{
// std::cout << "enableForceRTMAbortMode(): forceRTMAbortMode=" << forceRTMAbortMode << std::endl;
if (!forceRTMAbortMode)
{
if (isForceRTMAbortModeAvailable() && (core_gen_counter_num_max < 4))
{
for (auto m : MSR)
{
const auto res = m->write(MSR_TSX_FORCE_ABORT, 1);
if (res != sizeof(uint64))
{
std::cerr << "Warning: writing 1 to MSR_TSX_FORCE_ABORT failed with error "
<< res << " on core "<< m->getCoreId() << std::endl;
}
}
readCoreCounterConfig(); // re-read core_gen_counter_num_max from CPUID
std::cout << "The number of custom counters is now "<< core_gen_counter_num_max << std::endl;
if (core_gen_counter_num_max < 4)
{
std::cerr << "PCM Warning: the number of custom counters did not increase (" << core_gen_counter_num_max << ")" << std::endl;
}
forceRTMAbortMode = true;
}
}
}
bool PCM::isForceRTMAbortModeEnabled() const
{
return forceRTMAbortMode;
}
void PCM::disableForceRTMAbortMode()
{
// std::cout << "disableForceRTMAbortMode(): forceRTMAbortMode=" << forceRTMAbortMode << std::endl;
if (forceRTMAbortMode)
{
for (auto m : MSR)
{
const auto res = m->write(MSR_TSX_FORCE_ABORT, 0);
if (res != sizeof(uint64))
{
std::cerr << "Warning: writing 0 to MSR_TSX_FORCE_ABORT failed with error "
<< res << " on core " << m->getCoreId() << std::endl;
}
}
readCoreCounterConfig(); // re-read core_gen_counter_num_max from CPUID
std::cout << "The number of custom counters is now " << core_gen_counter_num_max << std::endl;
if (core_gen_counter_num_max != 3)
{
std::cerr << "PCM Warning: the number of custom counters is not 3 (" << core_gen_counter_num_max << ")" << std::endl;
}
forceRTMAbortMode = false;
}
}
bool PCM::isForceRTMAbortModeAvailable() const
{
PCM_CPUID_INFO info;
pcm_cpuid(7, 0, info); // leaf 7, subleaf 0
return (info.reg.edx & (0x1 << 13)) ? true : false;
}
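// Parse the frequency from the CPUID brand string, which conventionally ends in
// a suffix such as "... CPU E5-2680 v3 @ 2.50GHz" (or "... 2666MHz" on older
// parts); the token before the unit is read as a number and scaled to Hz.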
uint64 get_frequency_from_cpuid() // from Pat Fay (Intel)
{
double speed=0;
std::string brand = PCM::getCPUBrandString();
if (brand.length() > std::string::size_type(0))
{
std::string::size_type unitsg = brand.find("GHz");
if(unitsg != std::string::npos)
{
std::string::size_type atsign = brand.rfind(' ', unitsg);
if(atsign != std::string::npos)
{
std::istringstream(brand.substr(atsign)) >> speed;
speed *= 1000;
}
}
else
{
std::string::size_type unitsg = brand.find("MHz");
if(unitsg != std::string::npos)
{
std::string::size_type atsign = brand.rfind(' ', unitsg);
if(atsign != std::string::npos)
{
std::istringstream(brand.substr(atsign)) >> speed;
}
}
}
}
return (uint64)(speed * 1000. * 1000.);
}
std::string PCM::getSupportedUarchCodenames() const
{
std::ostringstream ostr;
for(int32 i=0; i < static_cast<int32>(PCM::END_OF_MODEL_LIST) ; ++i)
if(isCPUModelSupported((int)i))
ostr << getUArchCodename(i) << ", ";
return std::string(ostr.str().substr(0, ostr.str().length() - 2));
}
std::string PCM::getUnsupportedMessage() const
{
std::ostringstream ostr;
ostr << "Error: unsupported processor. Only Intel(R) processors are supported (Atom(R) and microarchitecture codename "<< getSupportedUarchCodenames() <<").";
return std::string(ostr.str());
}
void PCM::computeQPISpeedBeckton(int core_nr)
{
uint64 startFlits = 0;
// reset all counters
MSR[core_nr]->write(U_MSR_PMON_GLOBAL_CTL, 1ULL << 29);
// enable counting on all counters on the left side (port 0-3)
MSR[core_nr]->write(R_MSR_PMON_GLOBAL_CTL_7_0, 255);
// disable on the right side (port 4-7)
MSR[core_nr]->write(R_MSR_PMON_GLOBAL_CTL_15_8, 0);
// count flits sent
MSR[core_nr]->write(R_MSR_PORT0_IPERF_CFG0, 1ULL << 31ULL);
// choose counter 0 to monitor R_MSR_PORT0_IPERF_CFG0
MSR[core_nr]->write(R_MSR_PMON_CTL0, 1 + 2 * (0));
// enable all counters
MSR[core_nr]->write(U_MSR_PMON_GLOBAL_CTL, (1ULL << 28) + 1);
MSR[core_nr]->read(R_MSR_PMON_CTR0, &startFlits);
const uint64 timerGranularity = 1000000ULL; // tick granularity: microseconds
uint64 startTSC = getTickCount(timerGranularity, (uint32) core_nr);
uint64 endTSC;
do
{
endTSC = getTickCount(timerGranularity, (uint32) core_nr);
} while (endTSC - startTSC < 200000ULL); // spin for 200 ms
uint64 endFlits = 0;
MSR[core_nr]->read(R_MSR_PMON_CTR0, &endFlits);
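// bytes/second = flit count * 8 payload bytes per QPI flit, scaled by the
// elapsed time expressed in timerGranularity units (microseconds)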
max_qpi_speed = (endFlits - startFlits) * 8ULL * timerGranularity / (endTSC - startTSC);
}
uint32 PCM::checkCustomCoreProgramming(std::shared_ptr<SafeMsrHandle> msr)
{
const auto core = msr->getCoreId();
if (size_t(core) >= lastProgrammedCustomCounters.size() || canUsePerf)
{
// checking 'canUsePerf' because corruption detection currently works
// only if perf is not used, see https://github.com/opcm/pcm/issues/106
return 0;
}
uint32 corruptedCountersMask = 0;
for (size_t ctr = 0; ctr < lastProgrammedCustomCounters[core].size(); ++ctr)
{
EventSelectRegister current;
if (msr->read(IA32_PERFEVTSEL0_ADDR + ctr, &current.value) != sizeof(current.value))
{
std::cerr << "PCM Error: can not read MSR 0x" << std::hex << (IA32_PERFEVTSEL0_ADDR + ctr) <<
" on core " << std::dec << core << std::endl;
continue;
}
if (canUsePerf)
{
current.fields.apic_int = 0; // perf sets this bit
}
if (current.value != lastProgrammedCustomCounters[core][ctr].value)
{
std::cerr << "PCM Error: someone has corrupted custom counter " << ctr << " on core " << core
<< " expected value " << lastProgrammedCustomCounters[core][ctr].value << " value read "
<< current.value << std::endl;
corruptedCountersMask |= (1<<ctr);
}
}
return corruptedCountersMask;
}
bool PCM::PMUinUse()
{
// follow the "Performance Monitoring Unit Sharing Guide" by P. Irelan and Sh. Kuo
for (int i = 0; i < (int)num_cores; ++i)
{
//std::cout << "Core "<<i<<" exemine registers"<< std::endl;
uint64 value = 0;
if (perfmon_version >= 4)
{
MSR[i]->read(MSR_PERF_GLOBAL_INUSE, &value);
for (uint32 j = 0; j < core_gen_counter_num_max; ++j)
{
if (value & (1ULL << j))
{
std::cerr << "WARNING: Custom counter " << j << " is in use. MSR_PERF_GLOBAL_INUSE on core " << i << ": 0x" << std::hex << value << std::dec << std::endl;
/*
For now we only test the MSR_PERF_GLOBAL_INUSE mechanism; at a later point this will report BUSY:
return true;
*/
}
}
}
MSR[i]->read(IA32_CR_PERF_GLOBAL_CTRL, &value);
// std::cout << "Core "<<i<<" IA32_CR_PERF_GLOBAL_CTRL is "<< std::hex << value << std::dec << std::endl;
EventSelectRegister event_select_reg;
event_select_reg.value = 0xFFFFFFFFFFFFFFFF;
for (uint32 j = 0; j < core_gen_counter_num_max; ++j)
{
MSR[i]->read(IA32_PERFEVTSEL0_ADDR + j, &event_select_reg.value);
if (event_select_reg.fields.event_select != 0 || event_select_reg.fields.apic_int != 0)
{
std::cerr << "WARNING: Core "<<i<<" IA32_PERFEVTSEL0_ADDR are not zeroed "<< event_select_reg.value << std::endl;
return true;
}
}
FixedEventControlRegister ctrl_reg;
ctrl_reg.value = 0xffffffffffffffff;
MSR[i]->read(IA32_CR_FIXED_CTR_CTRL, &ctrl_reg.value);
// Check if someone has installed pmi handler on counter overflow.
// If so, that agent might potentially need to change counter value
// for the "sample after"-mode messing up PCM measurements
if(ctrl_reg.fields.enable_pmi0 || ctrl_reg.fields.enable_pmi1 || ctrl_reg.fields.enable_pmi2)
{
std::cerr << "WARNING: Core "<<i<<" fixed ctrl:"<< ctrl_reg.value << std::endl;
return true;
}
// either os=0,usr=0 (not running) or os=1,usr=1 (matches PCM's mode) is ok; other combinations are not
if(ctrl_reg.fields.os0 != ctrl_reg.fields.usr0 ||
ctrl_reg.fields.os1 != ctrl_reg.fields.usr1 ||
ctrl_reg.fields.os2 != ctrl_reg.fields.usr2)
{
std::cerr << "WARNING: Core "<<i<<" fixed ctrl:"<< ctrl_reg.value << std::endl;
return true;
}
}
return false;
}
const char * PCM::getUArchCodename(const int32 cpu_model_param) const
{
auto cpu_model_ = cpu_model_param;
if(cpu_model_ < 0)
cpu_model_ = this->cpu_model ;
switch(cpu_model_)
{
case NEHALEM_EP:
case NEHALEM:
return "Nehalem/Nehalem-EP";
case ATOM:
return "Atom(tm)";
case CLARKDALE:
return "Westmere/Clarkdale";
case WESTMERE_EP:
return "Westmere-EP";
case NEHALEM_EX:
return "Nehalem-EX";
case WESTMERE_EX:
return "Westmere-EX";
case SANDY_BRIDGE:
return "Sandy Bridge";
case JAKETOWN:
return "Sandy Bridge-EP/Jaketown";
case IVYTOWN:
return "Ivy Bridge-EP/EN/EX/Ivytown";
case HASWELLX:
return "Haswell-EP/EN/EX";
case BDX_DE:
return "Broadwell-DE";
case BDX:
return "Broadwell-EP/EX";
case KNL:
return "Knights Landing";
case IVY_BRIDGE:
return "Ivy Bridge";
case HASWELL:
return "Haswell";
case BROADWELL:
return "Broadwell";
case SKL:
return "Skylake";
case KBL:
return "Kabylake";
case SKX:
if (cpu_model_param >= 0)
{
// query for specified cpu_model_param, stepping not provided
return "Skylake-SP, Cascade Lake-SP";
}
if (isCLX())
{
return "Cascade Lake-SP";
}
return "Skylake-SP";
}
return "unknown";
}
void PCM::cleanupPMU()
{
#ifdef PCM_USE_PERF
if(canUsePerf)
{
for (int i = 0; i < num_cores; ++i)
for(int c = 0; c < PERF_MAX_COUNTERS; ++c)
::close(perfEventHandle[i][c]);
return;
}
#endif
// follow the "Performance Monitoring Unit Sharing Guide" by P. Irelan and Sh. Kuo
for (int i = 0; i < (int)num_cores; ++i)
{
// disable generic counters and continue free running counting for fixed counters
MSR[i]->write(IA32_CR_PERF_GLOBAL_CTRL, (1ULL << 32) + (1ULL << 33) + (1ULL << 34));
for (uint32 j = 0; j < core_gen_counter_num_max; ++j)
{
MSR[i]->write(IA32_PERFEVTSEL0_ADDR + j, 0);
}
}
if(cpu_model == JAKETOWN)
enableJKTWorkaround(false);
#ifndef PCM_SILENT
std::cerr << " Zeroed PMU registers" << std::endl;
#endif
}
void PCM::cleanupUncorePMUs()
{
for (auto & sPMUs : iioPMUs)
{
for (auto & pmu : sPMUs)
{
pmu.second.cleanup();
}
}
for (auto & sCBOPMUs : cboPMUs)
{
for (auto & pmu : sCBOPMUs)
{
pmu.cleanup();
}
}
for (auto & pmu : pcuPMUs)
{
pmu.cleanup();
}
for (auto & uncore : server_pcicfg_uncore)
{
uncore->cleanupPMUs();
}
#ifndef PCM_SILENT
std::cerr << " Zeroed uncore PMU registers" << std::endl;
#endif
}
void PCM::resetPMU()
{
for (int i = 0; i < (int)num_cores; ++i)
{
// disable all counters
MSR[i]->write(IA32_CR_PERF_GLOBAL_CTRL, 0);
for (uint32 j = 0; j < core_gen_counter_num_max; ++j)
{
MSR[i]->write(IA32_PERFEVTSEL0_ADDR + j, 0);
}
FixedEventControlRegister ctrl_reg;
ctrl_reg.value = 0xffffffffffffffff;
MSR[i]->read(IA32_CR_FIXED_CTR_CTRL, &ctrl_reg.value);
if ((ctrl_reg.fields.os0 ||
ctrl_reg.fields.usr0 ||
ctrl_reg.fields.enable_pmi0 ||
ctrl_reg.fields.os1 ||
ctrl_reg.fields.usr1 ||
ctrl_reg.fields.enable_pmi1 ||
ctrl_reg.fields.os2 ||
ctrl_reg.fields.usr2 ||
ctrl_reg.fields.enable_pmi2)
!= 0)
MSR[i]->write(IA32_CR_FIXED_CTR_CTRL, 0);
}
#ifndef PCM_SILENT
std::cerr << " Zeroed PMU registers" << std::endl;
#endif
}
void PCM::freeRMID()
{
if(!(QOSMetricAvailable() && L3QOSMetricAvailable())) {
return;
}
for(int32 core = 0; core < num_cores; core ++ )
{
if(!isCoreOnline(core)) continue;
uint64 msr_pqr_assoc = 0 ;
uint64 msr_qm_evtsel = 0;
int32 rmid = 0;
int32 event = 0;
//Read 0xC8F MSR for each core
MSR[core]->read(IA32_PQR_ASSOC, &msr_pqr_assoc);
msr_pqr_assoc &= 0xffffffff00000000ULL;
//Write 0xC8F MSR with RMID 0
MSR[core]->write(IA32_PQR_ASSOC,msr_pqr_assoc);
msr_qm_evtsel = rmid & ((1ULL<<10)-1ULL) ;
msr_qm_evtsel <<= 32 ;
msr_qm_evtsel |= event & ((1ULL<<8)-1ULL);
//Write Event Id as 0 and RMID 0 to the MSR for each core
MSR[core]->write(IA32_QM_EVTSEL,msr_qm_evtsel);
}
std::cerr << " Freeing up all RMIDs" << std::endl;
}
void PCM::setOutput(const std::string filename)
{
outfile = new std::ofstream(filename.c_str());
backup_ofile = std::cout.rdbuf();
std::cout.rdbuf(outfile->rdbuf());
}
void PCM::restoreOutput()
{
// restore cout back to what it was originally
if(backup_ofile)
std::cout.rdbuf(backup_ofile);
// close output file
if(outfile)
outfile->close();
}
void PCM::cleanup()
{
InstanceLock lock(allow_multiple_instances);
if (MSR.empty()) return;
std::cerr << "Cleaning up" << std::endl;
if (decrementInstanceSemaphore())
cleanupPMU();
disableForceRTMAbortMode();
cleanupUncorePMUs();
freeRMID();
#ifdef __linux__
if (needToRestoreNMIWatchdog)
{
enableNMIWatchdog();
needToRestoreNMIWatchdog = false;
}
#endif
}
// hle is only available when cpuid has this:
// HLE: CPUID.07H.EBX.HLE [bit 4] = 1
bool PCM::supportsHLE() const
{
PCM_CPUID_INFO info;
pcm_cpuid(7, 0, info); // leaf 7, subleaf 0
return (info.reg.ebx & (0x1 << 4)) ? true : false;
}
// rtm is only available when cpuid has this:
// RTM: CPUID.07H.EBX.RTM [bit 11] = 1
bool PCM::supportsRTM() const
{
PCM_CPUID_INFO info;
pcm_cpuid(7, 0, info); // leaf 7, subleaf 0
return (info.reg.ebx & (0x1 << 11)) ? true : false;
}
#ifdef __APPLE__
uint32 PCM::getNumInstances()
{
return MSR[0]->getNumInstances();
}
uint32 PCM::incrementNumInstances()
{
return MSR[0]->incrementNumInstances();
}
uint32 PCM::decrementNumInstances()
{
return MSR[0]->decrementNumInstances();
}
int convertUnknownToInt(size_t size, char* value)
{
if(sizeof(int) == size)
{
return *(int*)value;
}
else if(sizeof(long) == size)
{
return *(long *)value;
}
else if(sizeof(long long) == size)
{
return *(long long *)value;
}
else
{
// In this case, we don't know what it is so we guess int
return *(int *)value;
}
}
#endif
bool PCM::decrementInstanceSemaphore()
{
if(allow_multiple_instances == false)
{
return programmed_pmu;
}
bool isLastInstance = false;
// when decrement was called before program() the numInstancesSemaphore
// may not be initialized, causing SIGSEGV. This fixes it.
if(numInstancesSemaphore == NULL)
return true;
#ifdef _MSC_VER
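// The Windows semaphore count tracks the number of running instances: the first
// wait removes this instance's own count, the second wait probes whether any
// other instance remains (a timeout means we were the last one).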
WaitForSingleObject(numInstancesSemaphore, 0);
DWORD res = WaitForSingleObject(numInstancesSemaphore, 0);
if (res == WAIT_TIMEOUT)
{
// I have the last instance of monitor
isLastInstance = true;
CloseHandle(numInstancesSemaphore);
}
else if (res == WAIT_OBJECT_0)
{
ReleaseSemaphore(numInstancesSemaphore, 1, NULL);
// std::cerr << "Someone else is running monitor instance, no cleanup needed"<< std::endl;
}
else
{
// unknown error
std::cerr << "ERROR: Bad semaphore. Performed cleanup twice?" << std::endl;
}
#elif __APPLE__
sem_wait(numInstancesSemaphore);
uint32 oldValue = PCM::getNumInstances();
sem_post(numInstancesSemaphore);
if(oldValue == 0)
{
// see same case for linux
return false;
}
sem_wait(numInstancesSemaphore);
uint32 currValue = PCM::decrementNumInstances();
sem_post(numInstancesSemaphore);
if(currValue == 0){
isLastInstance = true;
}
#else // if linux
int oldValue = -1;
sem_getvalue(numInstancesSemaphore, &oldValue);
if(oldValue == 0)
{
// the current value is already zero: the semaphore has already been decremented elsewhere
// (and thus any needed cleanup has been done there), so we no longer own the last instance; return false
return false;
}
sem_wait(numInstancesSemaphore);
int curValue = -1;
sem_getvalue(numInstancesSemaphore, &curValue);
if (curValue == 0)
{
// I have the last instance of monitor
isLastInstance = true;
// std::cerr << "I am the last one"<< std::endl;
}
#endif // end ifdef _MSC_VER
return isLastInstance;
}
uint64 PCM::getTickCount(uint64 multiplier, uint32 core)
{
return (multiplier * getInvariantTSC(CoreCounterState(), getCoreCounterState(core))) / getNominalFrequency();
}
uint64 PCM::getTickCountRDTSCP(uint64 multiplier)
{
return (multiplier*RDTSCP())/getNominalFrequency();
}
SystemCounterState getSystemCounterState()
{
PCM * inst = PCM::getInstance();
SystemCounterState result;
if (inst) result = inst->getSystemCounterState();
return result;
}
SocketCounterState getSocketCounterState(uint32 socket)
{
PCM * inst = PCM::getInstance();
SocketCounterState result;
if (inst) result = inst->getSocketCounterState(socket);
return result;
}
CoreCounterState getCoreCounterState(uint32 core)
{
PCM * inst = PCM::getInstance();
CoreCounterState result;
if (inst) result = inst->getCoreCounterState(core);
return result;
}
#ifdef PCM_USE_PERF
void PCM::readPerfData(uint32 core, std::vector<uint64> & outData)
{
if(perfEventHandle[core][PERF_GROUP_LEADER_COUNTER] < 0)
{
std::fill(outData.begin(), outData.end(), 0);
return;
}
uint64 data[1 + PERF_MAX_COUNTERS];
const int32 bytes2read = sizeof(uint64)*(1 + core_fixed_counter_num_used + core_gen_counter_num_used);
int result = ::read(perfEventHandle[core][PERF_GROUP_LEADER_COUNTER], data, bytes2read );
// data layout: nr counters; counter 0, counter 1, counter 2,...
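// e.g. with 3 fixed + 4 generic counters the buffer reads {7, fixed0, fixed1, fixed2, gen0, gen1, gen2, gen3}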
if(result != bytes2read)
{
std::cerr << "Error while reading perf data. Result is "<< result << std::endl;
std::cerr << "Check if you run other competing Linux perf clients." << std::endl;
} else if(data[0] != core_fixed_counter_num_used + core_gen_counter_num_used)
{
std::cerr << "Number of counters read from perf is wrong. Elements read: "<< data[0] << std::endl;
}
else
{ // copy all counters, they start from position 1 in data
std::copy((data + 1), (data + 1) + data[0], outData.begin());
}
}
#endif
void BasicCounterState::readAndAggregateTSC(std::shared_ptr<SafeMsrHandle> msr)
{
uint64 cInvariantTSC = 0;
PCM * m = PCM::getInstance();
uint32 cpu_model = m->getCPUModel();
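// Older Atoms (other than Avoton) lack an invariant TSC usable here, so the
// else branch below falls back to wall-clock time scaled by the nominal frequency.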
if (cpu_model != PCM::ATOM || m->getOriginalCPUModel() == PCM::ATOM_AVOTON)
msr->read(IA32_TIME_STAMP_COUNTER, &cInvariantTSC);
else
{
#ifdef _MSC_VER
cInvariantTSC = ((static_cast<uint64>(GetTickCount()/1000ULL)))*m->getNominalFrequency();
#else
struct timeval tp;
gettimeofday(&tp, NULL);
cInvariantTSC = (double(tp.tv_sec) + tp.tv_usec / 1000000.)*m->getNominalFrequency();
#endif
}
InvariantTSC += cInvariantTSC;
}
void BasicCounterState::readAndAggregate(std::shared_ptr<SafeMsrHandle> msr)
{
uint64 cInstRetiredAny = 0, cCpuClkUnhaltedThread = 0, cCpuClkUnhaltedRef = 0;
uint64 cL3Miss = 0;
uint64 cL3UnsharedHit = 0;
uint64 cL2HitM = 0;
uint64 cL2Hit = 0;
uint64 cL3Occupancy = 0;
uint64 cCStateResidency[PCM::MAX_C_STATE + 1];
memset(cCStateResidency, 0, sizeof(cCStateResidency));
uint64 thermStatus = 0;
uint64 cSMICount = 0;
const int32 core_id = msr->getCoreId();
TemporalThreadAffinity tempThreadAffinity(core_id); // speedup trick for Linux
PCM * m = PCM::getInstance();
const uint32 cpu_model = m->getCPUModel();
const int32 core_gen_counter_num_max = m->getMaxCustomCoreEvents();
const auto corruptedCountersMask = m->checkCustomCoreProgramming(msr);
// reading core PMU counters
#ifdef PCM_USE_PERF
if(m->canUsePerf)
{
std::vector<uint64> perfData(PERF_MAX_COUNTERS, 0ULL);
m->readPerfData(msr->getCoreId(), perfData);
cInstRetiredAny = perfData[PCM::PERF_INST_RETIRED_ANY_POS];
cCpuClkUnhaltedThread = perfData[PCM::PERF_CPU_CLK_UNHALTED_THREAD_POS];
cCpuClkUnhaltedRef = perfData[PCM::PERF_CPU_CLK_UNHALTED_REF_POS];
if (core_gen_counter_num_max > 0) cL3Miss = perfData[PCM::PERF_GEN_EVENT_0_POS];
if (core_gen_counter_num_max > 1) cL3UnsharedHit = perfData[PCM::PERF_GEN_EVENT_1_POS];
if (core_gen_counter_num_max > 2) cL2HitM = perfData[PCM::PERF_GEN_EVENT_2_POS];
if (core_gen_counter_num_max > 3) cL2Hit = perfData[PCM::PERF_GEN_EVENT_3_POS];
}
else
#endif
{
msr->read(INST_RETIRED_ANY_ADDR, &cInstRetiredAny);
msr->read(CPU_CLK_UNHALTED_THREAD_ADDR, &cCpuClkUnhaltedThread);
msr->read(CPU_CLK_UNHALTED_REF_ADDR, &cCpuClkUnhaltedRef);
switch (cpu_model)
{
case PCM::WESTMERE_EP:
case PCM::NEHALEM_EP:
case PCM::NEHALEM_EX:
case PCM::WESTMERE_EX:
case PCM::CLARKDALE:
case PCM::SANDY_BRIDGE:
case PCM::JAKETOWN:
case PCM::IVYTOWN:
case PCM::HASWELLX:
case PCM::BDX_DE:
case PCM::BDX:
case PCM::IVY_BRIDGE:
case PCM::HASWELL:
case PCM::BROADWELL:
case PCM::SKL:
case PCM::KBL:
case PCM::SKX:
if (core_gen_counter_num_max > 0) msr->read(IA32_PMC0, &cL3Miss);
if (core_gen_counter_num_max > 1) msr->read(IA32_PMC1, &cL3UnsharedHit);
if (core_gen_counter_num_max > 2) msr->read(IA32_PMC2, &cL2HitM);
if (core_gen_counter_num_max > 3) msr->read(IA32_PMC3, &cL2Hit);
break;
case PCM::ATOM:
case PCM::KNL:
if (core_gen_counter_num_max > 0) msr->read(IA32_PMC0, &cL3Miss); // for Atom mapped to ArchLLCMiss field
if (core_gen_counter_num_max > 1) msr->read(IA32_PMC1, &cL3UnsharedHit); // for Atom mapped to ArchLLCRef field
break;
}
}
if (corruptedCountersMask & 1) cL3Miss = ~0ULL;
if (corruptedCountersMask & 2) cL3UnsharedHit = ~0ULL;
if (corruptedCountersMask & 4) cL2HitM = ~0ULL;
if (corruptedCountersMask & 8) cL2Hit = ~0ULL;
// std::cout << "DEBUG1: "<< msr->getCoreId() << " " << cInstRetiredAny<< " "<< std::endl;
if(m->L3CacheOccupancyMetricAvailable())
{
msr->lock();
uint64 event = 1;
m->initQOSevent(event, core_id);
msr->read(IA32_QM_CTR,&cL3Occupancy);
//std::cout << "readAndAggregate reading IA32_QM_CTR "<< std::dec << cL3Occupancy << std::dec << std::endl;
msr->unlock();
}
m->readAndAggregateMemoryBWCounters(static_cast<uint32>(core_id), *this);
readAndAggregateTSC(msr);
// reading core C state counters
for(int i=0; i <= (int)(PCM::MAX_C_STATE) ;++i)
if(m->coreCStateMsr && m->coreCStateMsr[i])
msr->read(m->coreCStateMsr[i], &(cCStateResidency[i]));
// reading temperature
msr->read(MSR_IA32_THERM_STATUS, &thermStatus);
msr->read(MSR_SMI_COUNT, &cSMICount);
InstRetiredAny += m->extractCoreFixedCounterValue(cInstRetiredAny);
CpuClkUnhaltedThread += m->extractCoreFixedCounterValue(cCpuClkUnhaltedThread);
CpuClkUnhaltedRef += m->extractCoreFixedCounterValue(cCpuClkUnhaltedRef);
L3Miss += m->extractCoreGenCounterValue(cL3Miss);
L3UnsharedHit += m->extractCoreGenCounterValue(cL3UnsharedHit);
//std::cout << "Scaling Factor " << m->L3ScalingFactor;
cL3Occupancy = m->extractQOSMonitoring(cL3Occupancy);
L3Occupancy = (cL3Occupancy==PCM_INVALID_QOS_MONITORING_DATA)? PCM_INVALID_QOS_MONITORING_DATA : (uint64)((double)(cL3Occupancy * m->L3ScalingFactor) / 1024.0);
L2HitM += m->extractCoreGenCounterValue(cL2HitM);
L2Hit += m->extractCoreGenCounterValue(cL2Hit);
for(int i=0; i <= int(PCM::MAX_C_STATE);++i)
CStateResidency[i] += cCStateResidency[i];
ThermalHeadroom = extractThermalHeadroom(thermStatus);
SMICount += cSMICount;
}
PCM::ErrorCode PCM::programServerUncoreLatencyMetrics(bool enable_pmm)
{
uint32 DDRConfig[4] = {0,0,0,0};
if (enable_pmm == false)
{ //DDR is false
DDRConfig[0] = MC_CH_PCI_PMON_CTL_EVENT(0x80) + MC_CH_PCI_PMON_CTL_UMASK(0); // DRAM RPQ occupancy
DDRConfig[1] = MC_CH_PCI_PMON_CTL_EVENT(0x10) + MC_CH_PCI_PMON_CTL_UMASK(0); // DRAM RPQ Insert
DDRConfig[2] = MC_CH_PCI_PMON_CTL_EVENT(0x81) + MC_CH_PCI_PMON_CTL_UMASK(0); // DRAM WPQ Occupancy
DDRConfig[3] = MC_CH_PCI_PMON_CTL_EVENT(0x20) + MC_CH_PCI_PMON_CTL_UMASK(0); // DRAM WPQ Insert
} else {
DDRConfig[0] = MC_CH_PCI_PMON_CTL_EVENT(0xe0) + MC_CH_PCI_PMON_CTL_UMASK(1); // PMM RDQ occupancy
DDRConfig[1] = MC_CH_PCI_PMON_CTL_EVENT(0xe3) + MC_CH_PCI_PMON_CTL_UMASK(0); // PMM RDQ Insert
DDRConfig[2] = MC_CH_PCI_PMON_CTL_EVENT(0xe4) + MC_CH_PCI_PMON_CTL_UMASK(1); // PMM WPQ Occupancy
DDRConfig[3] = MC_CH_PCI_PMON_CTL_EVENT(0xe7) + MC_CH_PCI_PMON_CTL_UMASK(0); // PMM WPQ Insert
}
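// With occupancy and insert counts programmed per queue, average queue latency
// follows from Little's law: latency (in uncore clock cycles) = occupancy / inserts.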
if (hasPCICFGUncore())
{
for (size_t i = 0; i < (size_t)server_pcicfg_uncore.size(); ++i)
{
server_pcicfg_uncore[i]->programIMC(DDRConfig);
}
}
return PCM::Success;
}
PCM::ErrorCode PCM::programServerUncoreMemoryMetrics(int rankA, int rankB, bool PMM)
{
if(MSR.empty() || server_pcicfg_uncore.empty()) return PCM::MSRAccessDenied;
for (int i = 0; (i < (int)server_pcicfg_uncore.size()) && MSR.size(); ++i)
{
server_pcicfg_uncore[i]->programServerUncoreMemoryMetrics(rankA, rankB, PMM);
}
return PCM::Success;
}
PCM::ErrorCode PCM::programServerUncorePowerMetrics(int mc_profile, int pcu_profile, int * freq_bands)
{
if(MSR.empty() || server_pcicfg_uncore.empty()) return PCM::MSRAccessDenied;
uint32 PCUCntConf[4] = {0,0,0,0};
PCUCntConf[0] = PCU_MSR_PMON_CTL_EVENT(0); // clock ticks
switch(pcu_profile)
{
case 0:
PCUCntConf[1] = PCU_MSR_PMON_CTL_EVENT(0xB); // FREQ_BAND0_CYCLES
PCUCntConf[2] = PCU_MSR_PMON_CTL_EVENT(0xC); // FREQ_BAND1_CYCLES
PCUCntConf[3] = PCU_MSR_PMON_CTL_EVENT(0xD); // FREQ_BAND2_CYCLES
break;
case 1:
PCUCntConf[1] = PCU_MSR_PMON_CTL_EVENT(0x80) + PCU_MSR_PMON_CTL_OCC_SEL(1); // POWER_STATE_OCCUPANCY.C0 using CLOCKTICKS + 8th-bit
PCUCntConf[2] = PCU_MSR_PMON_CTL_EVENT(0x80) + PCU_MSR_PMON_CTL_OCC_SEL(2); // POWER_STATE_OCCUPANCY.C3 using CLOCKTICKS + 8th-bit
PCUCntConf[3] = PCU_MSR_PMON_CTL_EVENT(0x80) + PCU_MSR_PMON_CTL_OCC_SEL(3); // POWER_STATE_OCCUPANCY.C6 using CLOCKTICKS + 8th-bit
break;
case 2:
PCUCntConf[1] = PCU_MSR_PMON_CTL_EVENT(0x09); // PROCHOT_INTERNAL_CYCLES
PCUCntConf[2] = PCU_MSR_PMON_CTL_EVENT(0x0A); // PROCHOT_EXTERNAL_CYCLES
PCUCntConf[3] = PCU_MSR_PMON_CTL_EVENT(0x04); // Thermal frequency limit cycles: FREQ_MAX_LIMIT_THERMAL_CYCLES
break;
case 3:
PCUCntConf[1] = PCU_MSR_PMON_CTL_EVENT(0x04); // Thermal frequency limit cycles: FREQ_MAX_LIMIT_THERMAL_CYCLES
PCUCntConf[2] = PCU_MSR_PMON_CTL_EVENT(0x05); // Power frequency limit cycles: FREQ_MAX_POWER_CYCLES
PCUCntConf[3] = PCU_MSR_PMON_CTL_EVENT(0x07); // Clipped frequency limit cycles: FREQ_MAX_CURRENT_CYCLES (not supported on SKX)
break;
case 4: // not supported on SKX
PCUCntConf[1] = PCU_MSR_PMON_CTL_EVENT(0x06); // OS frequency limit cycles: FREQ_MAX_OS_CYCLES
PCUCntConf[2] = PCU_MSR_PMON_CTL_EVENT(0x05); // Power frequency limit cycles: FREQ_MAX_POWER_CYCLES
PCUCntConf[3] = PCU_MSR_PMON_CTL_EVENT(0x07); // Clipped frequency limit cycles: FREQ_MAX_CURRENT_CYCLES
break;
case 5:
if(JAKETOWN == cpu_model)
{
PCUCntConf[1] = PCU_MSR_PMON_CTL_EVENT(0) + PCU_MSR_PMON_CTL_EXTRA_SEL + PCU_MSR_PMON_CTL_EDGE_DET ; // number of frequency transitions
PCUCntConf[2] = PCU_MSR_PMON_CTL_EVENT(0) + PCU_MSR_PMON_CTL_EXTRA_SEL ; // cycles spent changing frequency
} else if (IVYTOWN == cpu_model )
{
PCUCntConf[1] = PCU_MSR_PMON_CTL_EVENT(0x60) + PCU_MSR_PMON