Skip to content
Permalink
Browse files
sysfs: add /sys/kernel/mm/numa/demotion_list
The current demotion algorithm was designed with the DRAM->PMEM case
in mind. This is not the only possible path for migration. In the
general case, it is hard to enumerate all possible scenarios.

It would be interesting to try different demotion paths depending on
the workload at run-time. Imagine a hosting business and a configuration
with one fast CPU node, one slow CPU node, one fast pmem and one slow
pmem. Depending on desired performance characteristics, the demotion
paths may look like this:
        1. fast cpu -> fast pmem -> swap
        2. slow cpu -> slow pmem -> swap

Or like this:
        1. fast cpu -> fast pmem -> slow pmem -> swap
        2. slow cpu -> slow pmem -> swap

Or like this:
        1. fast cpu -> fast pmem -> slow pmem -> swap
        2. slow cpu -> swap

Or like this:
        fast cpu -> slow cpu -> fast pmem -> slow pmem -> swap

This is clearly a tradeoff depending on desired performance. It may
get changed next morning, and it will require node chain adjustments,
preferably without stopping the machine and rebuilding the kernel.

The most robust solution I can figure out is a hierarchy of demotion
settings:

1. Device tree record;
2. cmdline parameter;
3. The current automatic approach, in the absence of 1 and 2;
4. Direct access to demotion chain using sysfs.

This patch implements (4).

Another interesting option is to use demotion_list to allocate memory
in case of MPOL_PREFERRED{_MANY}. It would be great to discuss all
alternatives.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
  • Loading branch information
YuryNorov committed Aug 15, 2021
1 parent 738d882 commit ada82b5573a980b7c5a7aeae488ea6c7213e6429
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 1 deletion.
@@ -40,6 +40,8 @@ enum migrate_demotion_flag {
/* In mm/debug.c; also keep sync with include/trace/events/migrate.h */
extern const char *migrate_reason_names[MR_TYPES];

extern int node_demotion[MAX_NUMNODES];

#ifdef CONFIG_MIGRATION

#ifndef CONFIG_64BIT
@@ -229,6 +231,7 @@ int migrate_vma_setup(struct migrate_vma *args);
void migrate_vma_pages(struct migrate_vma *migrate);
void migrate_vma_finalize(struct migrate_vma *migrate);
int next_demotion_node(int node);
int set_demotion_target(int node, int target);

#else /* CONFIG_MIGRATION disabled: */

@@ -237,6 +240,11 @@ static inline int next_demotion_node(int node)
return NUMA_NO_NODE;
}

/*
 * Stub for CONFIG_MIGRATION=n builds: demotion targets cannot be
 * configured without migration support.
 *
 * Must be 'static inline' in a header: a plain 'static' definition
 * would be duplicated into, and emit an unused-function warning in,
 * every translation unit that includes this header.
 */
static inline int set_demotion_target(int node, int target)
{
	return -ENOSYS;
}

#endif /* CONFIG_MIGRATION */

#endif /* _LINUX_MIGRATE_H */
@@ -3034,8 +3034,60 @@ static struct kobj_attribute numa_demotion_enabled_attr =
__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
numa_demotion_enabled_store);

/*
 * Print one "<node> <demotion target>" pair per line for every
 * online node. A target of NUMA_NO_NODE (-1) terminates a chain.
 */
static ssize_t numa_demotion_list_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	int nid;
	int written = 0;

	for_each_online_node(nid) {
		int ret = sysfs_emit_at(buf, written, "%d %d\n",
					nid, node_demotion[nid]);
		if (ret > 0)
			written += ret;
	}

	return written;
}

/*
 * Parse "<target>><nodelist>" (e.g. "1>0,2-3") and make <target> the
 * demotion target of every node in <nodelist>.
 */
static ssize_t numa_demotion_list_store(struct kobject *kobj,
					struct kobj_attribute *attr,
					const char *buf, size_t count)
{
	nodemask_t nodes;
	char *nodelist;
	int from, to, ret;

	/* Split the input at the first '>' within the write size. */
	nodelist = strnchr(buf, count, '>');
	if (!nodelist)
		return -EINVAL;

	/*
	 * NOTE(review): this writes through a pointer derived from the
	 * const 'buf'. sysfs store buffers are normally writable pages,
	 * but mutating caller-provided input deserves confirmation.
	 */
	*nodelist++ = 0;

	/* Left half: the demotion target node id. */
	ret = kstrtoint(buf, 0, &to);
	if (ret)
		return ret;

	/* Right half: the set of source nodes. */
	ret = nodelist_parse(nodelist, nodes);
	if (ret)
		return ret;

	/*
	 * Apply the target node by node. A source node that would form a
	 * demotion loop is warned about and skipped; any other error
	 * aborts, leaving nodes processed so far with the new target
	 * (i.e. the update is not atomic across the nodelist).
	 */
	for_each_node_mask(from, nodes) {
		ret = set_demotion_target(from, to);
		if (ret == -EXDEV)
			pr_warn("Cross-node loop for demotion: %d>%d\n", to, from);
		else if (ret)
			return ret;
	}

	return count;
}

/* /sys/kernel/mm/numa/demotion_list: readable and writable by root. */
static struct kobj_attribute numa_demotion_list_attr =
	__ATTR(demotion_list, 0644, numa_demotion_list_show,
	       numa_demotion_list_store);

/* Attributes exposed under /sys/kernel/mm/numa/. */
static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	&numa_demotion_list_attr.attr,
	NULL,
};

@@ -1153,7 +1153,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* must be held over all reads to ensure that no cycles are
* observed.
*/
static int node_demotion[MAX_NUMNODES] __read_mostly =
/*
 * Per-node demotion target, indexed by source node; NUMA_NO_NODE
 * terminates a chain. No longer static: the demotion_list sysfs
 * interface reads and updates it directly.
 */
int node_demotion[MAX_NUMNODES] __read_mostly =
	{[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE};

/**
@@ -3304,3 +3304,43 @@ static int __init migrate_on_reclaim_init(void)
}
late_initcall(migrate_on_reclaim_init);
#endif /* CONFIG_MEMORY_HOTPLUG */

/*
 * Publish a new demotion target for @node.
 *
 * get/put_online_mems() serializes against memory hotplug, which also
 * rewrites node_demotion[]; synchronize_rcu() waits until no reader can
 * still observe the old value before we return (the comment above
 * node_demotion[] says reads must be protected to avoid observing
 * cycles).
 *
 * NOTE(review): the store is a plain assignment; consider WRITE_ONCE()
 * if the readers use READ_ONCE() — verify against next_demotion_node().
 */
static void __set_demotion_target(const int node, const int target)
{
	get_online_mems();
	node_demotion[node] = target;
	synchronize_rcu();
	put_online_mems();
}

/**
 * set_demotion_target - make @target the demotion target of @node
 * @node:   source node whose demotion target is updated
 * @target: new demotion target, or NUMA_NO_NODE to terminate the chain
 *
 * Walks the existing demotion chain starting at @target and rejects the
 * update if it would create a cycle through @node.
 *
 * Return: 0 on success (including a silently-ignored offline @node);
 * -ERANGE if @target is out of bounds; -ENXIO if @target is offline;
 * -EXDEV if the update would create a demotion loop.
 */
int set_demotion_target(const int node, const int target)
{
	nodemask_t visited;
	int next, hops;

	if (target < NUMA_NO_NODE || target >= MAX_NUMNODES)
		return -ERANGE;

	if (target != NUMA_NO_NODE && !node_online(target))
		return -ENXIO;

	/* Ignore offline nodes. */
	if (!node_online(node))
		return 0;

	/*
	 * Follow the chain starting at @target; revisiting any node
	 * (including @node itself) means the update would form a cycle.
	 * NUMA_NO_NODE, not a bare -1, marks the end of a chain.
	 */
	nodes_clear(visited);
	node_set(node, visited);

	for (next = target, hops = 0;
	     next != NUMA_NO_NODE && hops < MAX_NUMNODES;
	     next = next_demotion_node(next), hops++) {
		if (node_test_and_set(next, visited))
			/* Demotion path has cross-links (loops). */
			return -EXDEV;
	}

	/*
	 * node_demotion[] broken? By pigeonhole, any loop must have been
	 * caught above before the hop count could reach MAX_NUMNODES.
	 */
	WARN_ON(hops >= MAX_NUMNODES);
	__set_demotion_target(node, target);

	return 0;
}

0 comments on commit ada82b5

Please sign in to comment.