# test_database_store.py — tests for the DatabaseStore Scrapy extension (221 lines, 8.96 KB).
import logging
import os
from datetime import datetime
from unittest.mock import Mock
import psycopg2
import pytest
from ocdsmerge.exceptions import DuplicateIdValueWarning
from scrapy.exceptions import NotConfigured
from kingfisher_scrapy.extensions import DatabaseStore, FilesStore
from tests import spider_with_crawler
# PostgreSQL connection string for the test database; when unset, database tests are skipped.
DATABASE_URL = os.getenv('KINGFISHER_COLLECT_DATABASE_URL')
# Skip database-backed tests if no DATABASE_URL is set, unless running on CI
# (CI requires the URL) — and CI_SKIP opts CI back out of the requirement.
SKIP_TEST_IF = not DATABASE_URL and ('CI' not in os.environ or 'CI_SKIP' in os.environ)
@pytest.fixture
def cursor():
    """Yield a cursor on the test database, dropping the ``test`` table on teardown."""
    conn = psycopg2.connect(DATABASE_URL)
    cur = conn.cursor()
    try:
        yield cur
    finally:
        # Clean up the table the extension creates, so each test starts fresh.
        cur.execute('DROP TABLE IF EXISTS test')
        conn.commit()
        cur.close()
        conn.close()
def test_from_crawler_missing_arguments():
    """from_crawler() raises NotConfigured when DATABASE_URL or FILES_STORE is absent."""
    spider = spider_with_crawler(crawl_time='2021-05-25T00:00:00')

    # Neither setting configured: DATABASE_URL is reported first.
    with pytest.raises(NotConfigured) as exc:
        DatabaseStore.from_crawler(spider.crawler)
    assert str(exc.value) == 'DATABASE_URL is not set.'

    # DATABASE_URL present but FILES_STORE missing.
    spider.crawler.settings = {'DATABASE_URL': 'test', 'FILES_STORE': None}
    with pytest.raises(NotConfigured) as exc:
        DatabaseStore.from_crawler(spider.crawler)
    assert str(exc.value) == 'FILES_STORE is not set.'
@pytest.mark.skipif(SKIP_TEST_IF, reason='KINGFISHER_COLLECT_DATABASE_URL must be set')
@pytest.mark.parametrize('from_date,default_from_date,messages', [
    (None, None, ['Getting the date from which to resume the crawl from the test table']),
    (None, '2020-01-01', ['Getting the date from which to resume the crawl from the test table']),
    ('2020-01-01', None, []),
    ('2020-01-01', '2020-01-01', []),
])
def test_spider_opened_no_resume(cursor, caplog, tmpdir, from_date, default_from_date, messages):
    """spider_opened() creates the table and only queries it when no from_date was given."""
    spider = spider_with_crawler(crawl_time='2021-05-25T00:00:00',
                                 settings={'DATABASE_URL': DATABASE_URL, 'FILES_STORE': tmpdir})
    spider.from_date = from_date
    spider.default_from_date = default_from_date
    store = DatabaseStore.from_crawler(spider.crawler)

    with caplog.at_level(logging.INFO):
        store.spider_opened(spider)

    # The extension must have created the "test" table.
    cursor.execute("SELECT to_regclass('test')")
    assert cursor.fetchone()[0] == 'test'
    # from_date is untouched, and logging matches the parametrized expectation.
    assert spider.from_date == from_date
    assert [entry.message for entry in caplog.records] == messages
@pytest.mark.skipif(SKIP_TEST_IF, reason='KINGFISHER_COLLECT_DATABASE_URL must be set')
def test_spider_opened_resume(caplog, tmpdir):
    """After a finished crawl is stored, re-opening the spider resumes from the max release date."""
    spider = spider_with_crawler(crawl_time='2021-05-25T00:00:00',
                                 settings={'DATABASE_URL': DATABASE_URL, 'FILES_STORE': tmpdir})
    spider.data_type = 'release_package'
    store = DatabaseStore.from_crawler(spider.crawler)
    files_store = FilesStore.from_crawler(spider.crawler)

    # Scrape one release package containing a single dated release.
    response = Mock()
    response.body = b'{"releases": [{"date": "2021-05-26T10:00:00Z"}]}'
    response.request = Mock()
    response.request.url = 'https://example.com/remote.json'
    response.request.meta = {'file_name': 'file.json'}
    item = spider.build_file_from_response(response, file_name='file.json', data_type='release_package')
    files_store.item_scraped(item, spider)

    # First run: open and close to populate the table.
    store.spider_opened(spider)
    caplog.clear()
    store.spider_closed(spider, 'finished')

    # Second run: opening again reads the stored max date and resumes from it.
    with caplog.at_level(logging.INFO):
        store.spider_opened(spider)

    assert spider.from_date == datetime(2021, 5, 26, 0, 0)
    assert [entry.message for entry in caplog.records] == [
        'Getting the date from which to resume the crawl from the test table',
        'Resuming the crawl from 2021-05-26',
    ]
@pytest.mark.skipif(SKIP_TEST_IF, reason='KINGFISHER_COLLECT_DATABASE_URL must be set')
def test_spider_closed_warnings(caplog, tmpdir):
    """Compiling releases surfaces DuplicateIdValueWarning for packages with repeated party ids."""
    spider = spider_with_crawler(crawl_time='2021-05-25T00:00:00',
                                 settings={'DATABASE_URL': DATABASE_URL, 'FILES_STORE': tmpdir})
    spider.data_type = 'release_package'
    spider.compile_releases = True
    store = DatabaseStore.from_crawler(spider.crawler)
    files_store = FilesStore.from_crawler(spider.crawler)

    response = Mock()
    response.request = Mock()
    response.request.url = 'https://example.com/remote.json'
    response.request.meta = {'file_name': 'file.json'}

    # Packages x and z repeat a party id; package y does not.
    packages = {
        'file-x.json': b'{"releases":[{"ocid":"x","date":"2021-05-26T10:00:00Z","parties":[{"id":"x"},{"id":"x"}]}]}',
        'file-y.json': b'{"releases":[{"ocid":"y","date":"2021-05-26T10:00:00Z","parties":[{"id":"y"}]}]}',
        'file-z.json': b'{"releases":[{"ocid":"z","date":"2021-05-26T10:00:00Z","parties":[{"id":"z"},{"id":"z"}]}]}',
    }
    for file_name, body in packages.items():
        response.body = body
        item = spider.build_file_from_response(response, file_name=file_name, data_type='release_package')
        files_store.item_scraped(item, spider)

    store.spider_opened(spider)
    caplog.clear()

    with pytest.warns(DuplicateIdValueWarning) as warnings:
        with caplog.at_level(logging.INFO):
            store.spider_closed(spider, 'finished')

    assert spider.from_date == datetime(2021, 5, 26, 0, 0)
    assert [entry.message for entry in caplog.records] == [
        f'Reading the {tmpdir}/test/20210525_000000 crawl directory with the empty prefix',
        'Creating generator of compiled releases',
        f'Writing the JSON data to the {tmpdir}/test/20210525_000000/data.csv CSV file',
        'Replacing the JSON data in the test table',
    ]
    # One warning per package with a duplicated party id (x and z).
    assert [warning.message for warning in warnings] == [
        ("x: Multiple objects have the `id` value 'x' in the `parties` array"),
        ("z: Multiple objects have the `id` value 'z' in the `parties` array"),
    ]
@pytest.mark.skipif(SKIP_TEST_IF, reason='KINGFISHER_COLLECT_DATABASE_URL must be set')
@pytest.mark.parametrize('data,data_type,sample,compile_releases', [
    (b'{"releases": [{"date": "2021-05-26T10:00:00Z"}]}', 'release_package', None, False),
    (b'{"releases": [{"date": "2021-05-26T10:00:00Z"}]}', 'release_package', 1, False),
    (b'{"releases": [{"ocid":"1", "date": "2021-05-26T10:00:00Z"}]}', 'release_package', None, True),
    (b'{"records": [{"compiledRelease": {"date": "2021-05-26T10:00:00Z"}}]}', 'record_package', None, False),
    (b'{"records": [{"releases": [{"ocid":"1", "date": "2021-05-26T10:00:00Z"}]}]}', 'record_package', None, True),
])
def test_spider_closed(cursor, caplog, tmpdir, data, data_type, sample, compile_releases):
    """spider_closed() loads scraped data into the table for each data type / option combination."""
    spider = spider_with_crawler(crawl_time='2021-05-25T00:00:00',
                                 settings={'DATABASE_URL': DATABASE_URL, 'FILES_STORE': tmpdir})
    spider.data_type = data_type
    spider.sample = sample
    spider.compile_releases = compile_releases
    store = DatabaseStore.from_crawler(spider.crawler)
    files_store = FilesStore.from_crawler(spider.crawler)

    # Scrape one file of the parametrized data.
    response = Mock()
    response.body = data
    response.request = Mock()
    response.request.url = 'https://example.com/remote.json'
    response.request.meta = {'file_name': 'file.json'}
    item = spider.build_file_from_response(response, file_name='file.json', data_type=data_type)
    files_store.item_scraped(item, spider)

    store.spider_opened(spider)
    caplog.clear()

    with caplog.at_level(logging.INFO):
        store.spider_closed(spider, 'finished')

    # The release date must have been loaded into the table.
    cursor.execute("SELECT max(data->>'date') FROM test")
    assert cursor.fetchone()[0] == '2021-05-26T10:00:00Z'

    # The JSON pointer prefix depends on compilation and package type.
    if compile_releases:
        prefix = 'empty' if data_type == 'release_package' else 'records.item.releases.item'
    else:
        prefix = 'releases.item' if data_type == 'release_package' else 'records.item.compiledRelease'
    # Sample crawls write under a "_sample"-suffixed directory.
    suffix = '_sample' if sample else ''

    expected = [
        f'Reading the {tmpdir}/test{suffix}/20210525_000000 crawl directory with the {prefix} prefix',
        f'Writing the JSON data to the {tmpdir}/test{suffix}/20210525_000000/data.csv CSV file',
        'Replacing the JSON data in the test table',
    ]
    if compile_releases:
        expected.insert(1, 'Creating generator of compiled releases')
    assert [entry.message for entry in caplog.records] == expected
@pytest.mark.skipif(SKIP_TEST_IF, reason='KINGFISHER_COLLECT_DATABASE_URL must be set')
def test_spider_closed_error(caplog, tmpdir):
    """spider_closed() does nothing (and logs nothing) when the crawl did not finish."""
    spider = spider_with_crawler(crawl_time='2021-05-25T00:00:00',
                                 settings={'DATABASE_URL': DATABASE_URL, 'FILES_STORE': tmpdir})
    store = DatabaseStore.from_crawler(spider.crawler)

    with caplog.at_level(logging.INFO):
        store.spider_closed(spider, 'closed')

    assert not caplog.records