Skip to content
Permalink
Browse files
sysfs: add /sys/kernel/mm/numa/demotion_list
Current demotion algorithm was designed with DRAM->PMEM case in mind.
This is not the only possible path for migration. In case of NVIDIA,
we would like to migrate GPU memory to DRAM. However we admit that in
general case, it's hard to figure out all possible scenarios.

It would be interesting to try different demotion paths depending on
workload at run-time. We consider at least gpu->dram->pmem and
gpu->pmem for testing.

The most robust solution I can figure out is a hierarchy of demotion
settings:

1. Device tree record;
2. cmdline parameter;
3. Current automatic approach with the absence of 1 and 2;
4. Direct access to demotion chain using sysfs.

This patch implements (4).

Another interesting option is to use demotion_list to allocate memory
in case of MPOL_PREFERRED{_MANY}.

It would be great to discuss all alternatives.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
  • Loading branch information
YuryNorov committed Aug 11, 2021
1 parent 922875e commit f2d905fe922bde5baddb485d6152a431b7954e50
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 1 deletion.
@@ -40,6 +40,8 @@ enum migrate_demotion_flag {
/* In mm/debug.c; also keep sync with include/trace/events/migrate.h */
extern const char *migrate_reason_names[MR_TYPES];

extern int node_demotion[MAX_NUMNODES];

#ifdef CONFIG_MIGRATION

#ifndef CONFIG_64BIT
@@ -229,6 +231,7 @@ int migrate_vma_setup(struct migrate_vma *args);
void migrate_vma_pages(struct migrate_vma *migrate);
void migrate_vma_finalize(struct migrate_vma *migrate);
int next_demotion_node(int node);
int set_demotion_target(int node, int target);

#else /* CONFIG_MIGRATION disabled: */

@@ -237,6 +240,11 @@ static inline int next_demotion_node(int node)
return NUMA_NO_NODE;
}

/*
 * Stub for !CONFIG_MIGRATION: demotion targets cannot be configured.
 * Must be 'static inline' — a plain 'static' definition in a header
 * emits a "defined but not used" warning (and a duplicate copy) in
 * every translation unit that includes it without calling it.
 */
static inline int set_demotion_target(int node, int target)
{
	return -ENOSYS;
}

#endif /* CONFIG_MIGRATION */

#endif /* _LINUX_MIGRATE_H */
@@ -3034,8 +3034,60 @@ static struct kobj_attribute numa_demotion_enabled_attr =
__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
numa_demotion_enabled_store);

/*
 * Show the demotion map: one "<node> <demotion_target>" pair per line
 * for every online node (target is NUMA_NO_NODE when the node has no
 * demotion target).
 */
static ssize_t numa_demotion_list_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	int nid, written = 0;

	for_each_online_node(nid) {
		int ret = sysfs_emit_at(buf, written, "%d %d\n",
					nid, node_demotion[nid]);

		if (ret > 0)
			written += ret;
	}

	return written;
}

/*
 * Store handler: input format is "<target>><node_list>", e.g. "2>0-1"
 * sets node 2 as the demotion target of nodes 0 and 1.  A target of -1
 * (NUMA_NO_NODE) detaches the listed nodes from any demotion chain.
 */
static ssize_t numa_demotion_list_store(struct kobject *kobj,
					struct kobj_attribute *attr,
					const char *buf, size_t count)
{
	nodemask_t nodes;
	char *nodelist;
	int from, to, ret;

	/* Locate the '>' separator; input without one is malformed. */
	nodelist = strnchr(buf, count, '>');
	if (!nodelist)
		return -EINVAL;

	/*
	 * Split the string in place so kstrtoint() below sees only the
	 * target number.  NOTE(review): this writes through memory handed
	 * in as const char *buf — sysfs passes a writable page here, but
	 * confirm mutating the store buffer is acceptable.
	 */
	*nodelist++ = 0;

	/* Part before '>' is the demotion target node id. */
	ret = kstrtoint(buf, 0, &to);
	if (ret)
		return ret;

	/* Part after '>' is a standard nodemask list, e.g. "0-2,4". */
	ret = nodelist_parse(nodelist, nodes);
	if (ret)
		return ret;

	for_each_node_mask(from, nodes) {
		ret = set_demotion_target(from, to);
		if (ret == -EXDEV)
			/* A loop is reported but doesn't abort the write. */
			pr_warn("Cross-node loop for demotion: %d>%d\n", to, from);
		else if (ret)
			return ret;
	}

	return count;
}

/* /sys/kernel/mm/numa/demotion_list: read/write the node demotion map. */
static struct kobj_attribute numa_demotion_list_attr =
	__ATTR(demotion_list, 0644, numa_demotion_list_show,
	       numa_demotion_list_store);

/* Attributes exposed under /sys/kernel/mm/numa/. */
static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	&numa_demotion_list_attr.attr,
	NULL,
};

@@ -1153,7 +1153,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* must be held over all reads to ensure that no cycles are
* observed.
*/
static int node_demotion[MAX_NUMNODES] __read_mostly =
int node_demotion[MAX_NUMNODES] __read_mostly =
{[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE};

/**
@@ -3218,6 +3218,46 @@ static void set_migration_target_nodes(void)
put_online_mems();
}

/*
 * Publish a new demotion target for @node.
 *
 * get/put_online_mems() serializes against memory hotplug;
 * synchronize_rcu() ensures no reader still observes the old
 * node_demotion[] value when we return (reads of the array are done
 * under RCU so that no cycles are observed — see the comment at the
 * array's definition).
 */
static void __set_demotion_target(const int node, const int target)
{
	get_online_mems();
	node_demotion[node] = target;
	synchronize_rcu();
	put_online_mems();
}

int set_demotion_target(const int node, const int target)
{
nodemask_t demotion_path;
int next, cnt;

if (target < NUMA_NO_NODE || target >= MAX_NUMNODES)
return -ERANGE;

if (target != NUMA_NO_NODE && !node_online(target))
return -ENXIO;

/* Ignore offline nodes. */
if (!node_online(node))
return 0;

nodes_clear(demotion_path);
node_set(node, demotion_path);

for (next = target, cnt = 0; next != -1 && cnt < MAX_NUMNODES;
next = next_demotion_node(next), cnt++) {
if (node_test_and_set(next, demotion_path))
/* Demotion path has cross-links (loops). */
return -EXDEV;
}

/* nodes_demotion[] broken? */
WARN_ON(cnt >= MAX_NUMNODES);
__set_demotion_target(node, target);

return 0;
}

/*
* React to hotplug events that might affect the migration targets
* like events that online or offline NUMA nodes.

0 comments on commit f2d905f

Please sign in to comment.