Skip to content

Commit

Permalink
sched: highmem: Store local kmaps in task struct
Browse files Browse the repository at this point in the history
Instead of storing the map per CPU, provide and use per-task storage. That
prepares for local kmaps which are preemptible.

The context switch code is preparatory and not yet in use because
kmap_atomic() runs with preemption disabled. Will be made usable in the
next step.

The context switch logic is safe even when an interrupt happens after
clearing or before restoring the kmaps. The kmap index in task struct is
not modified so any nesting kmap in an interrupt will use unused indices
and on return the counter is the same as before.

Also add an assert into the return to user space code. Going back to user
space with an active kmap local is a no-no.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20201118204007.372935758@linutronix.de
  • Loading branch information
Thomas Gleixner committed Nov 24, 2020
1 parent 14df326 commit 5fbda3e
Show file tree
Hide file tree
Showing 6 changed files with 136 additions and 10 deletions.
10 changes: 10 additions & 0 deletions include/linux/highmem-internal.h
Expand Up @@ -9,6 +9,16 @@
void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
void *__kmap_local_page_prot(struct page *page, pgprot_t prot);
void kunmap_local_indexed(void *vaddr);
/* Fork-time check: warns once and clears tsk->kmap_ctrl if its idx is non-zero */
void kmap_local_fork(struct task_struct *tsk);
/* Context switch: clear the PTEs of all local kmaps recorded for current */
void __kmap_local_sched_out(void);
/* Context switch: re-install the PTEs of all local kmaps recorded for current */
void __kmap_local_sched_in(void);
/*
 * Debug assertion that the current task holds no local kmap, i.e. that
 * current->kmap_ctrl.idx is zero. Used on the return to user space path.
 */
static inline void kmap_assert_nomap(void)
{
DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx);
}
#else
/*
 * Stubs for the #else branch (presumably !CONFIG_KMAP_LOCAL - the opening
 * conditional is not visible here): there is no per-task kmap state, so
 * both hooks compile to nothing.
 */
static inline void kmap_local_fork(struct task_struct *tsk)
{
}

static inline void kmap_assert_nomap(void)
{
}
#endif

#ifdef CONFIG_HIGHMEM
Expand Down
9 changes: 9 additions & 0 deletions include/linux/sched.h
Expand Up @@ -34,6 +34,7 @@
#include <linux/rseq.h>
#include <linux/seqlock.h>
#include <linux/kcsan.h>
#include <asm/kmap_size.h>

/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
Expand Down Expand Up @@ -629,6 +630,13 @@ struct wake_q_node {
struct wake_q_node *next;
};

/*
 * Per-task bookkeeping for local kmaps, embedded in task_struct as
 * kmap_ctrl. The structure is empty unless CONFIG_KMAP_LOCAL is set.
 */
struct kmap_ctrl {
#ifdef CONFIG_KMAP_LOCAL
/* Depth of the task's kmap_local stack; moves in steps of KM_INCR */
int idx;
/*
 * PTE value recorded per slot when a mapping is established and reset
 * to __pte(0) on unmap; replayed by the context switch code.
 */
pte_t pteval[KM_MAX_IDX];
#endif
};

struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
Expand Down Expand Up @@ -1294,6 +1302,7 @@ struct task_struct {
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
struct kmap_ctrl kmap_ctrl;
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long task_state_change;
#endif
Expand Down
2 changes: 2 additions & 0 deletions kernel/entry/common.c
Expand Up @@ -2,6 +2,7 @@

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/highmem.h>
#include <linux/livepatch.h>
#include <linux/audit.h>

Expand Down Expand Up @@ -194,6 +195,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)

/* Ensure that the address limit is intact and no locks are held */
addr_limit_user_check();
kmap_assert_nomap();
lockdep_assert_irqs_disabled();
lockdep_sys_exit();
}
Expand Down
1 change: 1 addition & 0 deletions kernel/fork.c
Expand Up @@ -930,6 +930,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
account_kernel_stack(tsk, 1);

kcov_task_init(tsk);
kmap_local_fork(tsk);

#ifdef CONFIG_FAULT_INJECTION
tsk->fail_nth = 0;
Expand Down
25 changes: 25 additions & 0 deletions kernel/sched/core.c
Expand Up @@ -4094,6 +4094,22 @@ static inline void finish_lock_switch(struct rq *rq)
# define finish_arch_post_lock_switch() do { } while (0)
#endif

/*
 * Scheduler hook for the outgoing task: hand over to the highmem code only
 * when the task actually holds local kmaps. Compiles to nothing without
 * CONFIG_KMAP_LOCAL.
 */
static inline void kmap_local_sched_out(void)
{
#ifdef CONFIG_KMAP_LOCAL
	const int nr_active = current->kmap_ctrl.idx;

	/* Holding a kmap across a context switch is the rare case */
	if (unlikely(nr_active != 0))
		__kmap_local_sched_out();
#endif
}

/*
 * Scheduler hook for the incoming task: hand over to the highmem code only
 * when the task has local kmaps recorded. Compiles to nothing without
 * CONFIG_KMAP_LOCAL.
 */
static inline void kmap_local_sched_in(void)
{
#ifdef CONFIG_KMAP_LOCAL
	const int nr_active = current->kmap_ctrl.idx;

	/* Holding a kmap across a context switch is the rare case */
	if (unlikely(nr_active != 0))
		__kmap_local_sched_in();
#endif
}

/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
Expand All @@ -4116,6 +4132,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
prepare_arch_switch(next);
}
Expand Down Expand Up @@ -4182,6 +4199,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
finish_lock_switch(rq);
finish_arch_post_lock_switch();
kcov_finish_switch(current);
/*
* kmap_local_sched_out() is invoked with rq::lock held and
* interrupts disabled. There is no requirement for that, but the
* sched out code does not have an interrupt enabled section.
* Restoring the maps on sched in does not require interrupts being
* disabled either.
*/
kmap_local_sched_in();

fire_sched_in_preempt_notifiers(current);
/*
Expand Down
99 changes: 89 additions & 10 deletions mm/highmem.c
Expand Up @@ -365,8 +365,6 @@ EXPORT_SYMBOL(kunmap_high);

#include <asm/kmap_size.h>

/*
 * Per-CPU kmap_local stack index.
 * NOTE(review): the commit description says the index moves into per-task
 * storage (task_struct::kmap_ctrl.idx); this per-CPU definition looks like
 * the removed side of the diff - confirm it is gone in the final file.
 */
static DEFINE_PER_CPU(int, __kmap_local_idx);

/*
* With DEBUG_KMAP_LOCAL the stack depth is doubled and every second
* slot is unused which acts as a guard page
Expand All @@ -379,23 +377,21 @@ static DEFINE_PER_CPU(int, __kmap_local_idx);

/*
 * The kmap_local stack index lives in the current task
 * (task_struct::kmap_ctrl.idx) rather than in a per-CPU variable, so that
 * the mappings can follow the task across a context switch. The three
 * helpers below must all operate on that same counter.
 */

/*
 * Reserve the next slot on current's kmap_local stack.
 *
 * Advances the index by KM_INCR and returns the slot to use for the new
 * mapping (the new top of stack minus one). BUGs when the stack would
 * overflow KM_MAX_IDX.
 */
static inline int kmap_local_idx_push(void)
{
	WARN_ON_ONCE(in_irq() && !irqs_disabled());
	current->kmap_ctrl.idx += KM_INCR;
	BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX);
	return current->kmap_ctrl.idx - 1;
}

/* Return the slot of current's most recently pushed mapping. */
static inline int kmap_local_idx(void)
{
	return current->kmap_ctrl.idx - 1;
}

/*
 * Release the top slot of current's kmap_local stack. BUGs on an
 * unbalanced pop, i.e. when the index would drop below zero.
 */
static inline void kmap_local_idx_pop(void)
{
	current->kmap_ctrl.idx -= KM_INCR;
	BUG_ON(current->kmap_ctrl.idx < 0);
}

#ifndef arch_kmap_local_post_map
Expand Down Expand Up @@ -464,6 +460,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
pteval = pfn_pte(pfn, prot);
set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval);
arch_kmap_local_post_map(vaddr, pteval);
current->kmap_ctrl.pteval[kmap_local_idx()] = pteval;
preempt_enable();

return (void *)vaddr;
Expand Down Expand Up @@ -522,10 +519,92 @@ void kunmap_local_indexed(void *vaddr)
arch_kmap_local_pre_unmap(addr);
pte_clear(&init_mm, addr, kmap_pte - idx);
arch_kmap_local_post_unmap(addr);
current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0);
kmap_local_idx_pop();
preempt_enable();
}
EXPORT_SYMBOL(kunmap_local_indexed);

/*
* Invoked before switch_to(). This is safe even when during or after
* clearing the maps an interrupt which needs a kmap_local happens because
* the task::kmap_ctrl.idx is not modified by the unmapping code so a
* nested kmap_local will use the next unused index and restore the index
* on unmap. The already cleared kmaps of the outgoing task are irrelevant
* because the interrupt context does not know about them. The same applies
* when scheduling back in for an interrupt which happens before the
* restore is complete.
*/
void __kmap_local_sched_out(void)
{
struct task_struct *tsk = current;
pte_t *kmap_pte = kmap_get_pte();
int i;

/* Clear kmaps */
for (i = 0; i < tsk->kmap_ctrl.idx; i++) {
pte_t pteval = tsk->kmap_ctrl.pteval[i];
unsigned long addr;
int idx;

/* With debug all even slots are unmapped and act as guard */
if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) {
WARN_ON_ONCE(!pte_none(pteval));
continue;
}
if (WARN_ON_ONCE(pte_none(pteval)))
continue;

/*
* This is a horrible hack for XTENSA to calculate the
* coloured PTE index. Uses the PFN encoded into the pteval
* and the map index calculation because the actual mapped
* virtual address is not stored in task::kmap_ctrl.
* For any sane architecture this is optimized out.
*/
idx = arch_kmap_local_map_idx(i, pte_pfn(pteval));

addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
arch_kmap_local_pre_unmap(addr);
pte_clear(&init_mm, addr, kmap_pte - idx);
arch_kmap_local_post_unmap(addr);
}
}

void __kmap_local_sched_in(void)
{
struct task_struct *tsk = current;
pte_t *kmap_pte = kmap_get_pte();
int i;

/* Restore kmaps */
for (i = 0; i < tsk->kmap_ctrl.idx; i++) {
pte_t pteval = tsk->kmap_ctrl.pteval[i];
unsigned long addr;
int idx;

/* With debug all even slots are unmapped and act as guard */
if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) {
WARN_ON_ONCE(!pte_none(pteval));
continue;
}
if (WARN_ON_ONCE(pte_none(pteval)))
continue;

/* See comment in __kmap_local_sched_out() */
idx = arch_kmap_local_map_idx(i, pte_pfn(pteval));
addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
set_pte_at(&init_mm, addr, kmap_pte - idx, pteval);
arch_kmap_local_post_map(addr, pteval);
}
}

void kmap_local_fork(struct task_struct *tsk)
{
if (WARN_ON_ONCE(tsk->kmap_ctrl.idx))
memset(&tsk->kmap_ctrl, 0, sizeof(tsk->kmap_ctrl));
}

#endif

#if defined(HASHED_PAGE_VIRTUAL)
Expand Down

0 comments on commit 5fbda3e

Please sign in to comment.