#!/bin/bash
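#
# Deploy Socorro to stage or prod: build and sign an RPM, bake an AMI with
# packer, roll it out with terraform, double each autoscaling group, wait for
# the ELBs to go healthy, then drain and terminate the old instances.
# Runs under Jenkins, which injects ${payload} (the GitHub webhook JSON) and,
# for manual deploys, ${GITHUBHASH} as build parameters.
#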
############################
# COMMON FUNCTIONS AND SETTINGS
# Get incoming arguments
REQUIREDARGS=1
ENVNAME=$1
NUMOFARGS=$#
if echo "$2" | grep -q "skiprpm";then
echo "`date` -- Skipping RPM for this run"
SKIPRPM="true"
fi
if echo "$3" | grep -q "skipami";then
NEWAMI="ami-19102029" # Placeholder AMI id, used only when skipping the AMI build
SKIPAMI="true"
fi
if [ "$2" = "manual" ];then
MANUALPUSH="true" # We're just doing a deploy by hash
HASHTODEPLOY=${GITHUBHASH} # GITHUBHASH comes from the Jenkins "build with parameters" form: the git commit hash to deploy
else
MANUALPUSH="false"
fi
GITPAYLOAD=${payload} # GitHub webhook JSON, injected by Jenkins
RANDOM_STRING="$RANDOM-$RANDOM"
STARTLOG=/var/log/jenkins/${RANDOM_STRING}-startlog.out
ENDLOG=/var/log/jenkins/${RANDOM_STRING}-endlog.out
ENDRETURNCODE=0
# Get AWS creds injected
. /home/centos/.aws-config
# Bring in functions in lib
. /home/centos/socorro-infra/bin/lib/identify_role.sh
. /home/centos/socorro-infra/bin/lib/create_rpm.sh
. /home/centos/socorro-infra/bin/lib/create_ami.sh
. /home/centos/socorro-infra/bin/lib/infra_status.sh
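# Assumed from usage below: identify_role sets AUTOSCALENAME, ELBNAME,
# TERRAFORMNAME and SCALEVARIABLE for a role/env pair, and infra_report
# prints the current state of an autoscaling group.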
# Show usage instructions for noobs
function show_usage() {
echo " =================== ";echo " ";
echo " Syntax "
echo " ./deploy-socorro.sh $env [manual](optional) [$githash-to-deploy](optional)"
echo " "
echo " Example: ./deploy-socorro.sh stage"
exit 0
}
# Check if you are asking for halp or passing the wrong number of args
function check_syntax() {
if [ "$1" = "-h" ] || [ "$1" = "--help" ];then
show_usage
fi
if [ ${NUMOFARGS} -lt ${REQUIREDARGS} ];then
echo "`date` -- ERROR, ${REQUIREDARGS} arguments required and ${NUMOFARGS} is an invalid number of args"
exit 1
fi
}
function error_check() {
if [ ${RETURNCODE} -ne 0 ];then
echo "`date` -- Error encountered during ${PROGSTEP} : ${RETURNCODE}"
if [ "${NOTFATAL}" = "true" ];then
echo "Not fatal, continuing"
NOTFATAL="false"
else
echo "Fatal, exiting"
echo "===================="
echo "Instances which may need to be terminated manually: ${INITIALINSTANCES}"
exit 1
fi
fi
}
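# Usage pattern: set RETURNCODE=$? immediately after a command, then call
# error_check. Set NOTFATAL="true" first to log the failure and keep going.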
####################################
## PROGRAM FUNCTIONS
function format_logs() {
echo " ";echo " ";echo "====================";
environment_banner
echo " ====================";echo " ";echo " "
}
function environment_banner() {
if [ "$ENVNAME" == "stage" ]; then
echo ' _____ _'
echo '/ ___| |'
echo '\ `--.| |_ __ _ __ _ ___'
echo ' `--. \ __/ _` |/ _` |/ _ \ '
echo '/\__/ / || (_| | (_| | __/'
echo '\____/ \__\__,_|\__, |\___|'
echo ' __/ |'
echo ' |___/'
fi
if [ "$ENVNAME" == "prod" ]; then
echo '______ _'
echo '| ___ \ | |'
echo '| |_/ / __ ___ __| |'
echo '| __/ `__/ _ \ / _` |'
echo '| | | | | (_) | (_| |'
echo '\_| |_| \___/ \__,_|'
fi
}
function parse_github_payload() {
PROGSTEP="Parse github webhook payload"
# We will get all sorts of fun info here!
echo "Git Payload for copying into jenkins build parameter"
echo " ";echo ${GITPAYLOAD};echo " "
GITCOMMITHASH=$(echo ${GITPAYLOAD} | jq -r '.commits[0].id')
echo "Git commit hash: ${GITCOMMITHASH}"
GITREF=$(echo ${GITPAYLOAD} | jq -r '.ref')
echo "Git Ref: ${GITREF}"
if echo ${GITREF} | grep -q "refs/tags/";then
GITTAG=$GITREF
GITCOMMITHASH=$(echo ${GITPAYLOAD} | jq -r '.head_commit.id')
ENVNAME="prod";format_logs
echo "`date` -- Tag detected, # ${GITTAG}"
else
if [ "${MANUALPUSH}" = "true" ];then
GITCOMMITHASH=${GITHUBHASH}
else
GITTAG="false"
ENVNAME="stage";format_logs
echo "`date` -- No tag detected, no manual build flag, env must be stage"
fi
fi
echo "Git Tag: ${GITTAG}"
# jq is already a dependency here, so use it instead of fragile sed/awk JSON splitting
GITCOMMITTER=$(echo ${GITPAYLOAD} | jq -r '.head_commit.committer.name')
echo "Committer: ${GITCOMMITTER}"
}
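# Illustrative (abbreviated) push payload, showing the fields read above:
# { "ref": "refs/heads/master",
#   "commits": [ { "id": "abc123...", ... } ],
#   "head_commit": { "id": "abc123...", "committer": { "name": "Jane Doe" } } }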
function scale_in_per_elb() {
PROGSTEP="Scale in";format_logs
echo "`date` -- Checking current desired capacity of autoscaling group for ${ROLEENVNAME}"
# We'll set the initial capacity and go back to that at the end
INITIALCAPACITY=$(aws autoscaling describe-auto-scaling-groups \
--auto-scaling-group-names ${AUTOSCALENAME} \
--output text \
--query 'AutoScalingGroups[*].DesiredCapacity')
RETURNCODE=$?;error_check
echo "`date` -- ${AUTOSCALENAME} initial capacity is set to ${INITIALCAPACITY}"
# How many new nodes will we need to scale in for this deploy?
DEPLOYCAPACITY=$((INITIALCAPACITY*2))
echo "`date` -- ${AUTOSCALENAME} capacity for this deploy is set to ${DEPLOYCAPACITY}"
# Get a list of existing instance ids to terminate later
INITIALINSTANCES="${INITIALINSTANCES} $(aws autoscaling \
describe-auto-scaling-groups \
--auto-scaling-group-name ${AUTOSCALENAME} \
--output text \
--query 'AutoScalingGroups[*].Instances[*].InstanceId')"
RETURNCODE=$?;error_check
echo "`date` -- Current instance list: ${INITIALINSTANCES}"
# Tell the AWS api to give us more instances in that role env.
aws autoscaling update-auto-scaling-group --auto-scaling-group-name ${AUTOSCALENAME} --min-size ${DEPLOYCAPACITY}
RETURNCODE=$?;error_check
echo "`date` -- Minimum capacity set with a return code of ${RETURNCODE}"
aws autoscaling set-desired-capacity --auto-scaling-group-name ${AUTOSCALENAME} --desired-capacity ${DEPLOYCAPACITY}
RETURNCODE=$?;error_check
echo "`date` -- Desired capacity set with a return code of ${RETURNCODE}"
}
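# Example: with INITIALCAPACITY=2 the group grows to 4, so two instances on the
# new AMI come up alongside the two old ones recorded in INITIALINSTANCES.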
function check_health_per_elb() {
PROGSTEP="Checking ELB status for ${ELBNAME}"
# We'll want to ensure the number of healthy hosts is equal to total number of hosts
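# With --output text, describe-instance-health prints one row per instance and
# (as the awk below assumes) the fifth column is the state, InService/OutOfService.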
ASCAPACITY=$(aws autoscaling describe-auto-scaling-groups \
--auto-scaling-group-names ${AUTOSCALENAME} \
--output text \
--query 'AutoScalingGroups[*].DesiredCapacity')
RETURNCODE=$?;error_check
HEALTHYHOSTCOUNT=$(aws elb describe-instance-health \
--load-balancer-name ${ELBNAME} \
--output text | awk '{print $5}' | \
grep InService | wc -l | awk '{print $1}')
RETURNCODE=$?;error_check
CURRENTHEALTH="${CURRENTHEALTH} $(aws elb describe-instance-health \
--load-balancer-name ${ELBNAME} \
--output text | awk '{print $5}')"
RETURNCODE=$?;error_check
if [ ${HEALTHYHOSTCOUNT} -lt ${ASCAPACITY} ];then
CURRENTHEALTH="Out"
fi
echo "`date` -- ${AUTOSCALENAME} nodes healthy in ELB: ${HEALTHYHOSTCOUNT} / ${ASCAPACITY}"
}
function scale_in_all() {
PROGSTEP="Scaling in all the nodes"
# Each socorro env has its own master list in ./lib. We iterate over that list
# to scale up and identify nodes to kill later
for ROLEENVNAME in $(cat /home/centos/socorro-infra/bin/lib/${ENVNAME}_socorro_master.list)
do
identify_role ${ROLEENVNAME}
scale_in_per_elb
done
}
function monitor_overall_health() {
PROGSTEP="Waiting for all ELBs to report healthy and full"
# If any elb is still unhealthy, we don't want to kill nodes
ATTEMPTCOUNT=0;NOHEALTHALERT=""
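# Poll every ELB once a minute, up to 15 attempts, before giving up and
# flagging NOHEALTHALERT so the scale-down is skipped.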
until [ "${HEALTHSTATUS}" = "ALLHEALTHY" ]
do
ATTEMPTCOUNT=$((ATTEMPTCOUNT+1))
echo "`date` -- Attempt ${ATTEMPTCOUNT} of 15 checking on healthy elbs"
for ROLEENVNAME in $(cat /home/centos/socorro-infra/bin/lib/${ENVNAME}_socorro_master.list)
do
# Get the AS name and ELB name for this particular role/env
identify_role ${ROLEENVNAME}
if [ "${ELBNAME}" = "NONE" ];then
echo "No elb to check for ${ROLEENVNAME}" > /dev/null
else
check_health_per_elb
fi
done
# Check for OutOfService in the saved string of statuses. If it exists, reset and wait.
if echo ${CURRENTHEALTH} | grep Out > /dev/null;then
CURRENTHEALTH=""
sleep 60 # We want to be polite to the API
if [ $ATTEMPTCOUNT -gt 14 ]; then
echo "`date` -- ALERT! We've tried for 15 minutes to wait for healthy nodes, continuing"
NOHEALTHALERT="true"
HEALTHSTATUS="ALLHEALTHY"
fi
else
echo "`date` -- ELBs are now healthy"
HEALTHSTATUS="ALLHEALTHY"
fi
done
if [ "${HEALTHSTATUS}" = "ALLHEALTHY" ] && [ "${NOHEALTHALERT}" = "" ];then
echo "`date` -- We are bonafide healthy"
fi
}
function instance_deregister() {
# We check to see if each instance in a given ELB is one of the doomed nodes
if echo ${INITIALINSTANCES} | grep $1 > /dev/null;then
aws elb deregister-instances-from-load-balancer \
--load-balancer-name $2 \
--instances $1
RETURNCODE=$?;NOTFATAL="true";error_check
echo "`date` -- Attempt to deregister $1 from $2 returned a code of ${RETURNCODE}"
fi
}
function deregister_elb_nodes() {
# We'll list every ELB involved, and for each, list every instance. Then, for each instance
# we check if it is on the doomed list. If so, we deregister it to allow a 30s drain
PROGSTEP="Starting scale down"
echo "`date` -- ${PROGSTEP}"
for ROLEENVNAME in $(cat /home/centos/socorro-infra/bin/lib/${ENVNAME}_socorro_master.list)
do
# Get ELB and AS group name.
identify_role ${ROLEENVNAME}
if [ "${ELBNAME}" = "NONE" ];then
echo "No ELB to check for ${ROLEENVNAME}"
else
echo "Deregistering initial instances from ${ELBNAME}"
# For every instance in $ELBNAME, check if it's slated to be killed.
INSTANCES=$(aws elb describe-instance-health \
--load-balancer-name ${ELBNAME} \
--output text --query 'InstanceStates[*].InstanceId')
for INSTANCETOCHECK in $INSTANCES; do
instance_deregister ${INSTANCETOCHECK} ${ELBNAME}
done
fi
done
echo "`date` -- All instances in ELBs deregistered, waiting for the 30 second drain period"
}
function terminate_instances() {
# We iterate over the list of instances to terminate (${INITIALINSTANCES}) and send each one here to
# be terminated and simultaneously drop the desired-capacity down by 1.
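# --should-decrement-desired-capacity keeps the autoscaling group from
# launching a replacement for the instance we terminate.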
aws autoscaling terminate-instance-in-auto-scaling-group --instance-id $1 --should-decrement-desired-capacity
RETURNCODE=$?
echo "`date` -- $1 termination return code of ${RETURNCODE}"
}
function find_ami() {
PROGSTEP="Find correct prod AMI"
echo "`date` -- Attempting to locate AMI tagged with apphash ${GITCOMMITHASH}"
NEWAMI=$(aws ec2 describe-images \
--filters Name=tag:apphash,Values=${GITCOMMITHASH} \
--output text --query 'Images[0].ImageId')
RETURNCODE=$?;error_check
# With --output text, a null query result comes back as the literal string "None"
if [ -z "$NEWAMI" ] || [ "$NEWAMI" = "None" ]; then
echo "`date` -- AMI for apphash ${GITCOMMITHASH} could not be found."
RETURNCODE=1;error_check
fi
echo "`date` -- AMI id ${NEWAMI} found tagged with apphash ${GITCOMMITHASH}"
}
function apply_ami() {
PROGSTEP="Apply AMI using Terraform"
# For each of our apps, we want to use terraform to apply the new base AMI we've just created
for ROLEENVNAME in $(cat /home/centos/socorro-infra/bin/lib/${ENVNAME}_socorro_master.list)
do
# Get AS group name for each ROLEENVNAME
echo "`date` -- Checking role for ${ROLEENVNAME}"
identify_role ${ROLEENVNAME}
infra_report ${AUTOSCALENAME} >> ${STARTLOG}
ASCAPACITY=$(aws autoscaling describe-auto-scaling-groups \
--auto-scaling-group-names ${AUTOSCALENAME} \
--output text \
--query 'AutoScalingGroups[*].DesiredCapacity')
cd /home/centos/socorro-infra/terraform
echo "`date` -- Attempting to terraform plan and apply ${AUTOSCALENAME} with new AMI id ${NEWAMI} and tagging with ${SOCORROHASH}"
/home/centos/socorro-infra/terraform/wrapper.sh "plan -var base_ami={us-west-2=\"${NEWAMI}\"} -var ${SCALEVARIABLE}=${ASCAPACITY}" ${ENVNAME} ${TERRAFORMNAME}
RETURNCODE=$?;error_check
echo " ";echo " ";echo "==================================";echo " "
/home/centos/socorro-infra/terraform/wrapper.sh "apply -var base_ami={us-west-2=\"${NEWAMI}\"} -var ${SCALEVARIABLE}=${ASCAPACITY}" ${ENVNAME} ${TERRAFORMNAME}
RETURNCODE=$?;error_check
echo "`date` -- Got return code ${RETURNCODE} applying terraform update"
done
echo "`date` -- All AMI's updated"
}
function terminate_instances_all() {
PROGSTEP="Terminating instances"
for ROLEENVNAME in $(cat /home/centos/socorro-infra/bin/lib/${ENVNAME}_socorro_master.list)
do
# First, we halve the number of minimum size for each group
echo "`date` -- Setting min size for ${ROLEENVNAME}"
identify_role ${ROLEENVNAME}
ASCAPACITY=$(aws autoscaling describe-auto-scaling-groups \
--auto-scaling-group-names ${AUTOSCALENAME} \
--output text \
--query 'AutoScalingGroups[*].DesiredCapacity')
SCALEDOWNCAPACITY=$((ASCAPACITY/2))
if [ ${SCALEDOWNCAPACITY} -lt 1 ]; then
SCALEDOWNCAPACITY=1
fi
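# Example: a group doubled to 4 for the deploy gets a min size of 2 here; the
# old instances themselves are terminated below, decrementing capacity as we go.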
# Scale back to half current min size, unless that'd bring us to 0
echo "`date` -- Setting ${AUTOSCALENAME} from ${ASCAPACITY} min size to ${SCALEDOWNCAPACITY} to prep for instance killings"
aws autoscaling update-auto-scaling-group --auto-scaling-group-name ${AUTOSCALENAME} --min-size ${SCALEDOWNCAPACITY}
RETURNCODE=$?;error_check
done
echo "`date` -- Shooting servers in the face"
# With the list we built earlier of old instances, iterate over it and terminate/decrement
for doomedinstances in ${INITIALINSTANCES}
do
terminate_instances $doomedinstances
done
}
function query_end_scale() {
echo "END STATE FOR AUTO SCALING GROUPS"
for ROLEENVNAME in $(cat /home/centos/socorro-infra/bin/lib/${ENVNAME}_socorro_master.list);do
identify_role ${ROLEENVNAME}
infra_report ${AUTOSCALENAME} >> ${ENDLOG}
done
}
####################################
## PROGRAM RUN
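# Flow: parse payload -> (stage) build RPM and bake AMI, (prod/manual) find AMI
# -> terraform in the new AMI -> double each autoscaling group -> wait for
# healthy ELBs -> deregister and drain old nodes -> terminate them -> report.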
check_syntax "$@" # Scan for noobs (pass the script args through so the -h/--help check works)
parse_github_payload # We check the incoming payload for some info
if [ "${ENVNAME}" = "stage" ] && [ "${MANUALPUSH}" = "false" ];then
time create_rpm; format_logs # Git clone, build, package with fpm, sign, and upload rpm to S3 public repo
time create_ami; format_logs # Use packer to create an AMI to use as base
fi
if [ "${ENVNAME}" = prod ] || [ "${MANUALPUSH}" = "true" ];then
time find_ami; format_logs
fi
time apply_ami; format_logs # For each ROLEENV group, set the baseAMI to the AMI we created
time scale_in_all; format_logs # For each ROLEENV group, double the size and get a list of old instances to kill later
sleep 60 # Give API time to update the instances counts it returns
time monitor_overall_health; format_logs # After updates go out, monitor for all instances in all elbs to be healthy.
if [ "${NOHEALTHALERT}" = "true" ];then
# The health check loop above gave up after 15 attempts without an all-healthy response
echo "`date` -- Not going to scale out, since we aren't all healthy"
ENDRETURNCODE=1
else
time deregister_elb_nodes; format_logs # For anything with an elb, dereg instances to allow for connection drain
sleep 30 # Wait out the 30 second connection drain period
time terminate_instances_all; format_logs # Kill the instances we listed in ${INITIALINSTANCES}
fi
# All done, get our report.
echo "`date` -- Deployment complete"
echo "Nodes we think should have been killed:"
echo ${INITIALINSTANCES}
format_logs
query_end_scale; format_logs # What did our groups end at?
echo "New Socorro RPM Version: ${NEWSOCORROVERSION}"
echo "New AMI Name: ${SOCORROAMINAME}"
echo "New AMI ID: ${NEWAMI}"
echo "New AMI Hash Tag: ${GITCOMMITHASH}"
if [ "$GITTAG" = "false" ];then
echo "No tag"
else
echo "New git tag: ${GITTAG}"
fi
echo "Push triggered by: ${GITCOMMITTER}"
format_logs
echo "========== BEGINNING STATE =========="
cat ${STARTLOG}
format_logs
echo "========== ENDING STATE ==========="
cat ${ENDLOG}
rm ${STARTLOG}
rm ${ENDLOG}
exit ${ENDRETURNCODE} # You snazzy, snazzy engineer. You did it! (Probably)