Skip to content

Commit

Permalink
darwin: detect heterogeneous caches from sysctl perflevels
Browse files Browse the repository at this point in the history
When we get cpukinds from IOKit, we have to match them
with perflevels (P=0 and E=1 for now) to find out which
cpukinds correspond to sysctl perflevel caches.

If we can't match (may occur if future core types are
different from E and P), we fallback to sysctl perflevels.

Fixes #507

Signed-off-by: Brice Goglin <Brice.Goglin@inria.fr>
  • Loading branch information
bgoglin committed Jan 13, 2022
1 parent 565c241 commit f7c9aa8
Show file tree
Hide file tree
Showing 2 changed files with 166 additions and 6 deletions.
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ Version 2.8.0
+ Add HWLOC_TOPOLOGY_FLAG_NO_DISTANCES, _NO_MEMATTRS and _NO_CPUKINDS
to reduce the overhead when unneeded.
* Backends
+ Detect heterogeneous caches in hybrid CPUs on MacOS X,
thanks to Paul Bone for the help.
+ Max frequencies are not ignored on Linux anymore (as in hwloc 2.7.0),
but they may be slightly adjusted to avoid reporting hybrid CPUs
because of features such as Intel Turbo Boost Max 3.0.
Expand Down
170 changes: 164 additions & 6 deletions hwloc/topology-darwin.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ static void hwloc__darwin_cpukinds_add_cpu(struct hwloc_darwin_cpukinds *kinds,

#define DT_PLANE "IODeviceTree"

static int hwloc__darwin_look_iokit_cpukinds(struct hwloc_darwin_cpukinds *kinds)
static int hwloc__darwin_look_iokit_cpukinds(struct hwloc_darwin_cpukinds *kinds,
int *matched_perflevels)
{
io_registry_entry_t cpus_root;
io_iterator_t cpus_iter;
Expand Down Expand Up @@ -288,6 +289,7 @@ static int hwloc__darwin_look_iokit_cpukinds(struct hwloc_darwin_cpukinds *kinds
* perflevel0 always refers to the highest performance core type in the system.
* if we only have E and P, E=perflevel1, P=perflevel0, otherwise we don't know.
*/
*matched_perflevels = 1;
for(i=0; i<kinds->nr; i++) {
/*
* cluster types: https://developer.apple.com/news/?id=vk3m204o
Expand All @@ -298,6 +300,7 @@ static int hwloc__darwin_look_iokit_cpukinds(struct hwloc_darwin_cpukinds *kinds
} else if (kinds->kinds[1].cluster_type == 'P') {
kinds->kinds[i].perflevel = 0;
} else {
*matched_perflevels = 0;
if (!hwloc_hide_errors())
fprintf(stderr, "hwloc/darwin/cpukinds: unrecognized cluster type %c compatible %s, cannot match perflevels\n",
kinds->kinds[i].cluster_type, kinds->kinds[i].compatible);
Expand Down Expand Up @@ -399,6 +402,135 @@ static void hwloc__darwin_build_cache_level(struct hwloc_topology *topology,
}
}

static void hwloc__darwin_build_perflevel_cache_level(struct hwloc_topology *topology,
hwloc_bitmap_t cpuset, unsigned width,
hwloc_obj_type_t type,
unsigned depth, uint64_t size, int64_t linesize)
{
unsigned j,k;
int next = -1;
hwloc_debug_2args_bitmap("looking at perflevel cache depth %u width %u inside cpuset %s\n", depth, width, cpuset);
for (j = 0;; j++) {
hwloc_bitmap_t objcpuset;
hwloc_obj_t obj;

objcpuset = hwloc_bitmap_alloc();
for(k=0; k<width; k++) {
next = hwloc_bitmap_next(cpuset, next);
if (next < 0)
break;
hwloc_bitmap_set(objcpuset, next);
}
if (!k) {
/* failed to find any remaining cpu in cpuset, stop */
hwloc_bitmap_free(objcpuset);
return;
}

obj = hwloc_alloc_setup_object(topology, type, HWLOC_UNKNOWN_INDEX);
obj->cpuset = objcpuset;

hwloc_debug_2args_bitmap("L%ucache %u has cpuset %s\n",
depth, j, obj->cpuset);
obj->attr->cache.depth = depth;
obj->attr->cache.type = depth > 1 ? HWLOC_OBJ_CACHE_UNIFIED
: hwloc_obj_type_is_icache(type) ? HWLOC_OBJ_CACHE_INSTRUCTION : HWLOC_OBJ_CACHE_DATA;
obj->attr->cache.size = size;
obj->attr->cache.linesize = linesize;
obj->attr->cache.associativity = 0; /* unknown */
hwloc__insert_object_by_cpuset(topology, NULL, obj, "darwin:perflevel:cache");
}
}

struct gothybrid {
unsigned l1i, l1d, l2, l3;
};

static void hwloc__darwin_look_perflevel_caches(struct hwloc_topology *topology,
unsigned level,
hwloc_bitmap_t cpuset,
int64_t linesize,
struct gothybrid *gothybrid)
{
char name[64];
int64_t size;

snprintf(name, sizeof(name), "hw.perflevel%u.l1icachesize", level);
if (!hwloc_get_sysctlbyname(name, &size)) {
/* hw.perflevel%u.cpusperl1i missing, assume it's per PU */
hwloc_debug("found perflevel %u l1icachesize %ld, assuming width 1\n", level, (long) size);
hwloc__darwin_build_perflevel_cache_level(topology, cpuset, 1, HWLOC_OBJ_L1ICACHE, 1, size, linesize);
gothybrid->l1i++;
}

snprintf(name, sizeof(name), "hw.perflevel%u.l1dcachesize", level);
if (!hwloc_get_sysctlbyname(name, &size)) {
/* hw.perflevel%u.cpusperl1d missing, assume it's per PU */
hwloc_debug("found perflevel %u l1dcachesize %ld, assuming width 1\n", level, (long) size);
hwloc__darwin_build_perflevel_cache_level(topology, cpuset, 1, HWLOC_OBJ_L1CACHE, 1, size, linesize);
gothybrid->l1d++;
}

snprintf(name, sizeof(name), "hw.perflevel%u.l2cachesize", level);
if (!hwloc_get_sysctlbyname(name, &size)) {
int64_t cpus;

hwloc_debug("found perflevel %u l2cachesize %ld\n", level, (long) size);

snprintf(name, sizeof(name), "hw.perflevel%u.cpusperl2", level);
if (hwloc_get_sysctlbyname(name, &cpus)) {
hwloc_debug("couldn't find perflevel %u cpusperl2, assuming width 1\n", level);
cpus = 1;
} else {
hwloc_debug("found perflevel %u cpusperl2 %ld\n", level, (long) cpus);
}

{
/* hw.perflevelN.l2perflevels = "bitmap, where bit number of CPUs of the same type that share L2",
* not available yet, warn if we see one.
*/
size_t s;
snprintf(name, sizeof(name), "hw.perflevel%u.l2perflevels", level);
if (!sysctlbyname(name, NULL, &s, NULL, 0))
if (!hwloc_hide_errors())
fprintf(stderr, "hwloc/darwin: key %s succeeded size %lu, please report to hwloc developers.\n", name, s);
}

/* assume PUs are contigous for now. */
hwloc__darwin_build_perflevel_cache_level(topology, cpuset, cpus, HWLOC_OBJ_L2CACHE, 2, size, linesize);
gothybrid->l2++;
}

snprintf(name, sizeof(name), "hw.perflevel%u.l3cachesize", level);
if (!hwloc_get_sysctlbyname(name, &size)) {
int64_t cpus;

hwloc_debug("found perflevel %u l3cachesize %ld\n", level, (long) size);

snprintf(name, sizeof(name), "hw.perflevel%u.cpusperl3", level);
if (hwloc_get_sysctlbyname(name, &cpus)) {
hwloc_debug("couldn't find perflevel %u cpusperl3, assuming width 1\n", level);
cpus = 1;
} else {
hwloc_debug("found perflevel %u cpusperl3 %ld\n", level, (long) cpus);
}

{
/* hw.perflevelN.l3perflevels = "bitmap, where bit number of CPUs of the same type that share L2",
* not available yet, warn if we see one.
*/
size_t s;
snprintf(name, sizeof(name), "hw.perflevel%u.l3perflevels", level);
if (!sysctlbyname(name, NULL, &s, NULL, 0))
if (!hwloc_hide_errors())
fprintf(stderr, "hwloc/darwin: key %s succeeded size %lu, please report to hwloc developers.\n", name, s);
}

hwloc__darwin_build_perflevel_cache_level(topology, cpuset, cpus, HWLOC_OBJ_L3CACHE, 3, size, linesize);
gothybrid->l3++;
}
}

static int
hwloc_look_darwin(struct hwloc_backend *backend, struct hwloc_disc_status *dstatus)
{
Expand Down Expand Up @@ -427,11 +559,14 @@ hwloc_look_darwin(struct hwloc_backend *backend, struct hwloc_disc_status *dstat
char cpufamilynumber[20], cpumodelnumber[20], cpustepping[20];
int gotnuma = 0;
int gotnumamemory = 0;
struct gothybrid gothybrid;
int64_t nperflevels = 0;
struct hwloc_darwin_cpukinds kinds;
int cpukinds_from_sysctl;
char *env;

memset(&gothybrid, 0, sizeof(gothybrid));

assert(dstatus->phase == HWLOC_DISC_PHASE_CPU);

if (topology->levels[0][0]->cpuset)
Expand All @@ -456,8 +591,20 @@ hwloc_look_darwin(struct hwloc_backend *backend, struct hwloc_disc_status *dstat
* but that's why we use IOKit first.
*/
#if (defined HWLOC_HAVE_DARWIN_FOUNDATION) && (defined HWLOC_HAVE_DARWIN_IOKIT)
if (cpukinds_from_sysctl != 1)
hwloc__darwin_look_iokit_cpukinds(&kinds);
if (cpukinds_from_sysctl != 1) {
int matched_perflevels = 0;
hwloc__darwin_look_iokit_cpukinds(&kinds, &matched_perflevels);
hwloc_debug("Found %u kinds with matched=%u in IOKit for %ld sysctl perflevels\n",
kinds.nr, matched_perflevels, (long) nperflevels);
if (nperflevels > 0 && cpukinds_from_sysctl == -1 && (!matched_perflevels || nperflevels != kinds.nr)) {
/* sysctl has perflevel info, but we couldn't rank iokit kinds accordingly,
* it means we won't be able to find out which perflevel caches correspond to which cpukind.
* remove iokit cpukinds and fallback to sysctl cpukinds.
*/
hwloc_debug("Clearing IOKit cpukinds to read them from sysctl\n");
hwloc__darwin_cpukinds_clear(&kinds);
}
}
#endif

if (nperflevels > 0 && (cpukinds_from_sysctl == 1 || (cpukinds_from_sysctl == -1 && !kinds.nr))) {
Expand Down Expand Up @@ -626,6 +773,11 @@ hwloc_look_darwin(struct hwloc_backend *backend, struct hwloc_disc_status *dstat
hwloc_obj_add_info(topology->levels[0][0], "CPUStepping", cpustepping);
}

hwloc_debug("%s", "\nReading caches from sysctl perflevels\n");
for(i=0; i<kinds.nr; i++)
if (kinds.kinds[i].perflevel >= 0)
hwloc__darwin_look_perflevel_caches(topology, kinds.kinds[i].perflevel, kinds.kinds[i].cpuset, cachelinesize, &gothybrid);

if (hwloc_get_sysctlbyname("hw.l1dcachesize", &l1dcachesize))
l1dcachesize = 0;

Expand Down Expand Up @@ -689,28 +841,34 @@ hwloc_look_darwin(struct hwloc_backend *backend, struct hwloc_disc_status *dstat
cachesize[3] = l3cachesize;
}

hwloc_debug("%s", "caches");
hwloc_debug("%s", "non-hybrid caches");
for (i = 0; i < n && cacheconfig[i]; i++)
hwloc_debug(" %"PRIu64"(%"PRIu64"kB)", cacheconfig[i], cachesize[i] / 1024);

/* Now we know how many caches there are */
n = i;
hwloc_debug("\n%u cache levels\n", n - 1);
hwloc_debug("\n%u non-hybrid cache levels\n", n - 1);

/* slot 0 is memory */
hwloc__darwin_build_numa_level(topology, nprocs / cacheconfig[0], cacheconfig[0], cachesize[0],
&gotnuma, &gotnumamemory);
/* other slots are caches L1d, L2u, L3u, etc.
* L1i doesn't have an explicit slot but we can assume things are shared similarly to L1d in first slot.
*/
if (n>=2 && l1icachesize
if (!gothybrid.l1i && n>=2 && l1icachesize
&& hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_L1ICACHE)) {
hwloc__darwin_build_cache_level(topology, nprocs / cacheconfig[1], cacheconfig[1],
HWLOC_OBJ_L1ICACHE,
1, l1icachesize, cachelinesize, 0 /* unknown */);
}
/* Use slots for L1d, L2u, L3u now. */
for (i = 1; i < n; i++) {
if (i == 1 && gothybrid.l1d)
continue;
if (i == 2 && gothybrid.l2)
continue;
if (i == 3 && gothybrid.l3)
continue;
if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_L1CACHE+i-1)) {
hwloc__darwin_build_cache_level(topology, nprocs / cacheconfig[i], cacheconfig[i],
HWLOC_OBJ_L1CACHE+i-1,
Expand Down

0 comments on commit f7c9aa8

Please sign in to comment.