
Commit 83cd4fe

Venkatesh Pallipadi authored and Ingo Molnar committed
sched: Change nohz idle load balancing logic to push model
In the new push model, all idle CPUs indeed go into nohz mode. There is
still the concept of an idle load balancer (performing load balancing on
behalf of all the idle CPUs in the system). A busy CPU kicks the nohz
balancer when any of the nohz CPUs need idle load balancing. The kicked
CPU then does idle load balancing on behalf of all idle CPUs, instead of
the normal idle balance.

This addresses two problems with the existing nohz ilb logic:

* The idle load balancer continued to have periodic ticks during idle and
  woke up frequently, even though it did not have any rebalancing to do on
  behalf of any of the idle CPUs.

* On x86 and other CPUs whose APIC timer stops on idle, this periodic
  wakeup can result in an additional periodic interrupt on the CPU doing
  the timer broadcast.

Also, we currently migrate unpinned timers from an idle CPU to the CPU
doing idle load balancing. When all the CPUs in the system are idle,
there is no idle load balancing CPU and timers get added to the same idle
CPU where the request was made, so the existing optimization works only
on a semi-idle system. Moreover, in a semi-idle system we no longer have
periodic ticks on the idle load balancer CPU: using that CPU adds more
delay to the timers than intended (as that CPU's timer base may not be up
to date w.r.t. jiffies etc.). This was causing mysterious slowdowns
during boot etc.

For now, in the semi-idle case, use the nearest busy CPU for migrating
timers from an idle CPU. This is good for power savings anyway.

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
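The kick itself is implemented in kernel/sched_fair.c, one of the six
changed files whose hunks are not reproduced in this excerpt. As a rough
sketch of the mechanism (illustrative, not the verbatim patch), using the
rq->nohz_balance_kick flag and the remote_sched_softirq_cb call-single
data that the kernel/sched.c hunks below initialize:

/* Sketch: a busy CPU kicks the nohz balancer (sched_fair.c not shown). */
static void nohz_balancer_kick(int cpu)
{
        int ilb_cpu;

        /* Prefer the elected idle load balancer; else any nohz-idle CPU. */
        ilb_cpu = atomic_read(&nohz.load_balancer);
        if (ilb_cpu >= nr_cpu_ids) {
                ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
                if (ilb_cpu >= nr_cpu_ids)
                        return;         /* nobody to kick */
        }

        if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
                cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
                /*
                 * IPI the target; its callback raises SCHED_SOFTIRQ so it
                 * rebalances on behalf of all nohz-idle CPUs.
                 */
                __smp_call_function_single(ilb_cpu,
                                &per_cpu(remote_sched_softirq_cb, cpu), 0);
        }
}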
1 parent fdf3e95

6 files changed, +237 -159 lines changed

include/linux/sched.h

Lines changed: 3 additions & 6 deletions
@@ -271,14 +271,11 @@ extern int runqueue_is_locked(int cpu);
 
 extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
-extern int select_nohz_load_balancer(int cpu);
-extern int get_nohz_load_balancer(void);
+extern void select_nohz_load_balancer(int stop_tick);
+extern int get_nohz_timer_target(void);
 extern int nohz_ratelimit(int cpu);
 #else
-static inline int select_nohz_load_balancer(int cpu)
-{
-        return 0;
-}
+static inline void select_nohz_load_balancer(int stop_tick) { }
 
 static inline int nohz_ratelimit(int cpu)
 {
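Note the signature change: callers now tell the scheduler whether this
CPU is stopping its tick, rather than nominating themselves as (or
resigning from) ilb duty. The matching call-site update lives in
kernel/time/tick-sched.c, one of the six changed files not shown in this
excerpt; presumably the call sites become something like:

/* Hypothetical call sites in the tick-stop/restart paths: */
select_nohz_load_balancer(1);   /* entering nohz idle: tick stops    */
select_nohz_load_balancer(0);   /* exiting nohz idle: tick restarts  */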

kernel/hrtimer.c

Lines changed: 2 additions & 6 deletions
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 static int hrtimer_get_target(int this_cpu, int pinned)
 {
 #ifdef CONFIG_NO_HZ
-        if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
-                int preferred_cpu = get_nohz_load_balancer();
-
-                if (preferred_cpu >= 0)
-                        return preferred_cpu;
-        }
+        if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
+                return get_nohz_timer_target();
 #endif
         return this_cpu;
 }
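The caller-side simplification falls out of the new helper's contract:
get_nohz_timer_target() (added in kernel/sched.c below) always returns a
usable CPU, falling back to the requesting CPU when every domain peer is
idle, so the old preferred_cpu >= 0 guard is no longer needed.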

kernel/sched.c

Lines changed: 31 additions & 3 deletions
@@ -460,7 +460,7 @@ struct rq {
         unsigned long last_load_update_tick;
 #ifdef CONFIG_NO_HZ
         u64 nohz_stamp;
-        unsigned char in_nohz_recently;
+        unsigned char nohz_balance_kick;
 #endif
         unsigned int skip_clock_update;
 
@@ -1194,6 +1194,27 @@ static void resched_cpu(int cpu)
 }
 
 #ifdef CONFIG_NO_HZ
+/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+        int cpu = smp_processor_id();
+        int i;
+        struct sched_domain *sd;
+
+        for_each_domain(cpu, sd) {
+                for_each_cpu(i, sched_domain_span(sd))
+                        if (!idle_cpu(i))
+                                return i;
+        }
+        return cpu;
+}
 /*
  * When add_timer_on() enqueues a timer into the timer wheel of an
  * idle CPU then this timer might expire before the next timer event
@@ -7791,6 +7812,10 @@ void __init sched_init(void)
                 rq->idle_stamp = 0;
                 rq->avg_idle = 2*sysctl_sched_migration_cost;
                 rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+                rq->nohz_balance_kick = 0;
+                init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
 #endif
                 init_rq_hrtick(rq);
                 atomic_set(&rq->nr_iowait, 0);
@@ -7835,8 +7860,11 @@ void __init sched_init(void)
         zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-        zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
-        alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+        zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+        alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+        atomic_set(&nohz.load_balancer, nr_cpu_ids);
+        atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+        atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
 #endif
         /* May be allocated at isolcpus cmdline parse time */
         if (cpu_isolated_map == NULL)
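The three atomics seeded with nr_cpu_ids (the "no CPU" sentinel) carry
the push-model state: nohz.load_balancer is the elected ilb CPU, while
first_pick_cpu and second_pick_cpu let busy CPUs agree, locklessly, on
who is responsible for kicking it. The deciding logic sits in the
kernel/sched_fair.c hunks not shown here; a simplified, illustrative
sketch of the idea:

/*
 * Illustrative sketch (not the verbatim patch): called from a busy
 * CPU's scheduler tick to decide whether to kick the nohz balancer.
 */
static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
        int first = atomic_read(&nohz.first_pick_cpu);
        int second = atomic_read(&nohz.second_pick_cpu);

        /* Two other busy CPUs already own the kick duty. */
        if (first < nr_cpu_ids && first != cpu &&
            second < nr_cpu_ids && second != cpu)
                return 0;

        /* First busy CPU in the system: kick only if overloaded. */
        if (atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu) == nr_cpu_ids)
                return rq->nr_running > 1;

        /* Second busy CPU: any runnable task makes idle balancing useful. */
        if (atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu) == nr_cpu_ids)
                return rq->nr_running >= 1;

        return 0;
}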
