-
Notifications
You must be signed in to change notification settings - Fork 1.7k
/
optional-operators-configure-realtime-tuned-profile-commands.sh
executable file
·133 lines (112 loc) · 4.16 KB
/
optional-operators-configure-realtime-tuned-profile-commands.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/bin/bash
set -o nounset
set -o errexit
set -o pipefail
node_role=${APPLY_NODE_ROLE:=worker}
max_cpu=8
isolated_cpu=${COMPUTE_NODE_ISOLATED_CPU:-4}
sched_rt_runtime_us=-1
stalld_service="service.stalld=start,enable"
gcp_pattern="[n|c|m|a]{1}[1-9]{1}d?-(standard|highcpu|highmem|highgpu){1}-([0-9]+)"
# Currently RT is only supported on GCP
# if non supported node type provided, will default to 4 isolated and 8 max CPU usage
if [[ ${COMPUTE_NODE_TYPE-"not_provided"} =~ $gcp_pattern ]]
then
max_cpu=${BASH_REMATCH[2]}
echo "Using compute node $COMPUTE_NODE_TYPE, setting max vCPU to $max_cpu"
else
echo "No supported compute node detected, using default max vCPU of $max_cpu"
fi
if [[ "$isolated_cpu" == "$max_cpu" ]]; then
isolated_cpu=$(( max_cpu / 2))
echo "max and isolated cpu are equal, setting isolated CPU to $isolated_cpu"
fi
if [ ${STALLD_ENABLED:="true"} != "true" ]
then
echo "disabling stalld and setting default realtime timeout"
sched_rt_runtime_us=950000
stalld_service=""
fi
echo "Creating new realtime tuned profile on cluster"
oc create -f - <<EOF
apiVersion: tuned.openshift.io/v1
kind: Tuned
metadata:
name: ${node_role}-rt
namespace: openshift-cluster-node-tuning-operator
spec:
profile:
- data: |
[main]
summary=Openshift node optimized for deterministic performance at the cost of increased power consumption, focused on low latency network performance. Based on Tuned 2.11 and Cluster node tuning (oc 4.5)
include=openshift-node,cpu-partitioning
[variables]
isolated_cores=$isolated_cpu-$(( max_cpu - 1))
not_isolated_cores_expanded=\${f:cpulist_invert:\${isolated_cores_expanded}}
[cpu]
force_latency=cstate.id:1|3
governor=performance
energy_perf_bias=performance
min_perf_pct=100
[service]
$stalld_service
[vm]
transparent_hugepages=never
[irqbalance]
banned_cpus=""
[scheduler]
runtime=0
group.ksoftirqd=0:f:11:*:ksoftirqd.*
group.rcuc=0:f:11:*:rcuc.*
default_irq_smp_affinity = ignore
[sysctl]
kernel.hung_task_timeout_secs = 600
kernel.nmi_watchdog = 0
kernel.sched_rt_runtime_us = $sched_rt_runtime_us
kernel.timer_migration = 0
kernel.numa_balancing=0
net.core.busy_read=50
net.core.busy_poll=50
net.ipv4.tcp_fastopen=3
vm.stat_interval = 10
kernel.sched_min_granularity_ns=10000000
vm.dirty_ratio=10
vm.dirty_background_ratio=3
vm.swappiness=10
kernel.sched_migration_cost_ns=5000000
[selinux]
avc_cache_threshold=8192
[net]
nf_conntrack_hashsize=131072
[bootloader]
# set empty values to disable RHEL initrd setting in cpu-partitioning
initrd_remove_dir=
initrd_dst_img=
initrd_add_dir=
# overrides cpu-partitioning cmdline
cmdline_cpu_part=+nohz=on rcu_nocbs=\${isolated_cores} tuned.non_isolcpus=\${not_isolated_cpumask} intel_pstate=disable nosoftlockup
cmdline_realtime=+tsc=nowatchdog intel_iommu=on iommu=pt isolcpus=managed_irq,\${isolated_cores} systemd.cpu_affinity=\${not_isolated_cores_expanded}
cmdline_additionalArg=+ nmi_watchdog=0 audit=0 mce=off processor.max_cstate=1 idle=poll intel_idle.max_cstate=0 nohz_full=\${isolated_cores}
name: openshift-node-performance-manual
recommend:
- machineConfigLabels:
machineconfiguration.openshift.io/role: ${node_role}
operand:
debug: false
priority: 20
profile: openshift-node-performance-manual
EOF
echo "waiting for mcp/${node_role} condition=Updating timeout=5m"
oc wait mcp/${node_role} --for condition=Updating --timeout=5m
# When applying this configuration to the master node for single node
# we need to wait for the master node to restart before we attempt to call out to the API Server
if [[ "$node_role" == "master" ]]; then
seconds=600
echo "master node is updating, waiting $seconds seconds for master node to restart"
sleep $seconds
fi
echo "waiting for mcp/${node_role} condition=Updated timeout=30m"
oc wait mcp/${node_role} --for condition=Updated --timeout=30m
seconds=120
echo "waiting $seconds seconds to give some delay before collecting metrics"
sleep $seconds