Skip to content

Commit bcff128

Browse files
committed
Band-aid and test the crash of the account server
We have a better fix in the works, see the change Ic53068867feb0c18c88ddbe029af83a970336545. But it is taking too long to coalesce and users are unhappy right now. Related: rhbz#1838242, rhbz#1965348 Change-Id: I3f7bfc2877355b7cb433af77c4e2dfdfa94ff14d
1 parent 901d2e1 commit bcff128

File tree

6 files changed

+196
-34
lines changed

6 files changed

+196
-34
lines changed

swift/account/server.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,8 +185,9 @@ def PUT(self, req):
185185
broker.initialize(timestamp.internal)
186186
except DatabaseAlreadyExists:
187187
pass
188-
if req.headers.get('x-account-override-deleted', 'no').lower() != \
189-
'yes' and broker.is_deleted():
188+
if (req.headers.get('x-account-override-deleted', 'no').lower() !=
189+
'yes' and broker.is_deleted()) \
190+
or not os.path.exists(broker.db_file):
190191
return HTTPNotFound(request=req)
191192
broker.put_container(container, req.headers['x-put-timestamp'],
192193
req.headers['x-delete-timestamp'],

swift/container/updater.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,9 +271,13 @@ def process_container(self, dbfile):
271271
info['storage_policy_index'])
272272
for node in nodes]
273273
successes = 0
274+
stub404s = 0
274275
for event in events:
275-
if is_success(event.wait()):
276+
result = event.wait()
277+
if is_success(result):
276278
successes += 1
279+
if result == 404:
280+
stub404s += 1
277281
if successes >= majority_size(len(events)):
278282
self.logger.increment('successes')
279283
self.successes += 1
@@ -283,6 +287,15 @@ def process_container(self, dbfile):
283287
broker.reported(info['put_timestamp'],
284288
info['delete_timestamp'], info['object_count'],
285289
info['bytes_used'])
290+
elif stub404s == len(events):
291+
self.logger.increment('failures')
292+
self.failures += 1
293+
self.logger.debug(
294+
_('Update report stub for %(container)s %(dbfile)s'),
295+
{'container': container, 'dbfile': dbfile})
296+
broker.quarantine('no account replicas exist')
297+
# All that's left at this point is a few sacks of Gnocchi,
298+
# easily collected by the dark data watcher in object auditor.
286299
else:
287300
self.logger.increment('failures')
288301
self.failures += 1

test/probe/common.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@
3636
from swift.common import internal_client, direct_client, utils
3737
from swift.common.direct_client import DirectClientException
3838
from swift.common.ring import Ring
39-
from swift.common.utils import readconf, renamer, \
40-
rsync_module_interpolation, md5
39+
from swift.common.utils import hash_path, md5, \
40+
readconf, renamer, rsync_module_interpolation
4141
from swift.common.manager import Manager
4242
from swift.common.storage_policy import POLICIES, EC_POLICY, REPL_POLICY
4343
from swift.obj.diskfile import get_data_dir
@@ -582,6 +582,13 @@ def run_custom_daemon(self, klass, conf_section, conf_index,
582582
return daemon
583583

584584

585+
def _get_db_file_path(obj_dir):
    """Return the path of the newest '*db' file in *obj_dir*.

    Directory entries are scanned in reverse-sorted order so that, when
    several DB files exist, the lexicographically greatest (newest
    timestamped) one wins. Returns None when no matching file is found.
    """
    for entry in sorted(os.listdir(obj_dir), reverse=True):
        if entry.endswith('db'):
            return os.path.join(obj_dir, entry)
    return None
590+
591+
585592
class ReplProbeTest(ProbeTest):
586593

587594
acct_cont_required_replicas = 3
@@ -625,6 +632,23 @@ def direct_get_container(self, account=None, container=None,
625632
return self.direct_container_op(direct_client.direct_get_container,
626633
account, container, expect_failure)
627634

635+
def get_container_db_files(self, container):
636+
opart, onodes = self.container_ring.get_nodes(self.account, container)
637+
onode = onodes[0]
638+
db_files = []
639+
for onode in onodes:
640+
node_id = (onode['port'] % 100) // 10
641+
device = onode['device']
642+
hash_str = hash_path(self.account, container)
643+
server_conf = readconf(self.configs['container-server'][node_id])
644+
devices = server_conf['app:container-server']['devices']
645+
obj_dir = '%s/%s/containers/%s/%s/%s/' % (devices,
646+
device, opart,
647+
hash_str[-3:], hash_str)
648+
db_files.append(_get_db_file_path(obj_dir))
649+
650+
return db_files
651+
628652

629653
class ECProbeTest(ProbeTest):
630654

test/probe/test_container_failures.py

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
from os import listdir
18-
from os.path import join as path_join
1917
from unittest import main
2018
from uuid import uuid4
2119

@@ -26,20 +24,13 @@
2624

2725
from swift.common import direct_client
2826
from swift.common.exceptions import ClientException
29-
from swift.common.utils import hash_path, readconf
27+
from swift.common.utils import readconf
3028
from test.probe.common import kill_nonprimary_server, \
3129
kill_server, ReplProbeTest, start_server
3230

3331
eventlet.monkey_patch(all=False, socket=True)
3432

3533

36-
def get_db_file_path(obj_dir):
37-
files = sorted(listdir(obj_dir), reverse=True)
38-
for filename in files:
39-
if filename.endswith('db'):
40-
return path_join(obj_dir, filename)
41-
42-
4334
class TestContainerFailures(ReplProbeTest):
4435

4536
def test_one_node_fails(self):
@@ -154,23 +145,6 @@ def test_all_nodes_fail(self):
154145
client.put_object(self.url, self.token, container1, 'obj3', 'x')
155146
self.assertEqual(caught.exception.http_status, 503)
156147

157-
def _get_container_db_files(self, container):
158-
opart, onodes = self.container_ring.get_nodes(self.account, container)
159-
onode = onodes[0]
160-
db_files = []
161-
for onode in onodes:
162-
node_id = (onode['port'] % 100) // 10
163-
device = onode['device']
164-
hash_str = hash_path(self.account, container)
165-
server_conf = readconf(self.configs['container-server'][node_id])
166-
devices = server_conf['app:container-server']['devices']
167-
obj_dir = '%s/%s/containers/%s/%s/%s/' % (devices,
168-
device, opart,
169-
hash_str[-3:], hash_str)
170-
db_files.append(get_db_file_path(obj_dir))
171-
172-
return db_files
173-
174148
def test_locked_container_dbs(self):
175149

176150
def run_test(num_locks, catch_503):
@@ -179,7 +153,7 @@ def run_test(num_locks, catch_503):
179153
# Get the container info into memcache (so no stray
180154
# get_container_info calls muck up our timings)
181155
client.get_container(self.url, self.token, container)
182-
db_files = self._get_container_db_files(container)
156+
db_files = self.get_container_db_files(container)
183157
db_conns = []
184158
for i in range(num_locks):
185159
db_conn = connect(db_files[i])

test/probe/test_object_failures.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def _setup_data_file(self, container, obj, data):
6363
opart, onodes = self.object_ring.get_nodes(
6464
self.account, container, obj)
6565
onode = onodes[0]
66-
node_id = (onode['port'] % 100) / 10
66+
node_id = (onode['port'] % 100) // 10
6767
device = onode['device']
6868
hash_str = hash_path(self.account, container, obj)
6969
obj_server_conf = readconf(self.configs['object-server'][node_id])

test/probe/test_orphan_container.py

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
#!/usr/bin/python -u
2+
# Copyright (c) 2010-2012 OpenStack Foundation
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13+
# implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
import os
18+
19+
from swiftclient import client
20+
from unittest import main
21+
22+
from swift.common.exceptions import LockTimeout
23+
from swift.common.manager import Manager
24+
from swift.common.utils import hash_path, readconf, Timestamp
25+
from swift.container.backend import ContainerBroker
26+
27+
from test.probe.common import (
28+
kill_nonprimary_server, kill_server, start_server, ReplProbeTest)
29+
30+
# Why is this not called test_container_orphan? Because the crash
31+
# happens in the account server, so both account and container
32+
# services are involved.
33+
#
34+
# The common way users do this is to use TripleO to deploy an overcloud
35+
# and add Gnocchi. Gnocchi is hammering Swift, its container has updates
36+
# all the time. Then, users crash the overcloud and re-deploy it,
37+
# using the new suffix in swift.conf. Thereafter, container service
38+
# inherits old container with outstanding updates, container updater
39+
# tries to send updates to the account server, while the account cannot
40+
# be found anymore. In this situation, in Swift 2.25.0, account server
41+
# tracebacks, and the cycle continues without end.
42+
43+
44+
class TestOrphanContainer(ReplProbeTest):
    """Probe test for container updates whose account no longer exists.

    Reproduces the rhbz#1838242 scenario: a container DB carries an
    outstanding update for an account whose DBs have been destroyed, and
    verifies that the container updater quarantines the orphan instead of
    cycling forever against a crashing account server.
    """

    def get_account_db_files(self, account):
        """Return the expected DB file path on each primary account node.

        :param account: account name to locate in the account ring
        :returns: list of account DB file paths, one per primary node
        """
        # This is "more correct" than (port_num%100)//10, but is it worth it?
        # We have the assumption about port_num vs node_id embedded all over.
        account_configs = {}
        for _, cname in self.configs['account-server'].items():
            conf = readconf(cname)
            # config parser cannot know if it's a number or not, so int()
            port = int(conf['app:account-server']['bind_port'])
            account_configs[port] = conf

        part, nodes = self.account_ring.get_nodes(account)
        hash_str = hash_path(account)

        ret = []
        for node in nodes:
            data_dir = 'accounts'
            device = node['device']
            conf = account_configs[node['port']]
            devices = conf['app:account-server']['devices']

            # os.path.join is for the weak
            db_file = '%s/%s/%s/%s/%s/%s/%s.db' % (
                devices, device, data_dir, part,
                hash_str[-3:], hash_str, hash_str)
            ret.append(db_file)
        return ret

    def test_update_pending(self):
        """An orphan container's pending update must drain, not loop."""

        # Create container
        container = 'contx'
        client.put_container(self.url, self.token, container)

        # Only the node list matters here; the partition is unused.
        _part, nodes = self.account_ring.get_nodes(self.account)
        anode = nodes[0]

        # Stop a quorum of account servers
        # This allows the put to continue later.
        kill_nonprimary_server(nodes, self.ipport2server)
        kill_server((anode['ip'], anode['port']), self.ipport2server)

        # Put object
        # This creates an outstanding update.
        client.put_object(self.url, self.token, container, 'object1', b'123')

        cont_db_files = self.get_container_db_files(container)
        self.assertEqual(len(cont_db_files), 3)

        # Collect the observable state from containers
        outstanding_files = []
        for cfile in cont_db_files:
            broker = ContainerBroker(cfile)
            try:
                info = broker.get_info()
            except LockTimeout:
                self.fail('LockTimeout at %s' % (cfile,))
            if Timestamp(info['put_timestamp']) <= 0:
                self.fail('No put_timestamp at %s' % (cfile,))
            # Correct even if reported_put_timestamp is zero.
            if info['put_timestamp'] > info['reported_put_timestamp']:
                outstanding_files.append(cfile)
        self.assertGreater(len(outstanding_files), 0)

        # At this point the users shut everything down and screw up the
        # hash in swift.conf. But we destroy the account DB instead.
        files = self.get_account_db_files(self.account)
        for afile in files:
            os.unlink(afile)

        # Restart the stopped primary server
        start_server((anode['ip'], anode['port']), self.ipport2server)

        # Make sure updaters run
        Manager(['container-updater']).once()

        # Collect the observable state from containers again and examine it
        outstanding_files_new = []
        for cfile in cont_db_files:

            # We aren't catching DatabaseConnectionError, because
            # we only want to approve of DBs that were quarantined,
            # and not otherwise damaged. So if the code below throws
            # an exception for other reason, we want the test to fail.
            if not os.path.exists(cfile):
                continue

            broker = ContainerBroker(cfile)
            try:
                info = broker.get_info()
            except LockTimeout:
                self.fail('LockTimeout at %s' % (cfile,))
            if Timestamp(info['put_timestamp']) <= 0:
                self.fail('No put_timestamp at %s' % (cfile,))
            # Correct even if reported_put_timestamp is zero.
            if info['put_timestamp'] > info['reported_put_timestamp']:
                outstanding_files_new.append(cfile)
        # NOTE(review): the original called self.assertLengthEqual(...),
        # which unittest.TestCase does not provide and is not defined in
        # the probe-test base classes visible here -- it would raise
        # AttributeError. assertEqual on len() is the equivalent check.
        self.assertEqual(len(outstanding_files_new), 0)

        self.get_to_final_state()
147+
148+
149+
if __name__ == '__main__':
150+
main()

0 commit comments

Comments
 (0)