-
Notifications
You must be signed in to change notification settings - Fork 12
/
test_incrementaldatastore.py
90 lines (63 loc) · 3.13 KB
/
test_incrementaldatastore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import json
import os
from unittest.mock import patch
import psycopg2
import pytest
from scrapy.cmdline import execute
from scrapy.utils.project import get_project_settings
from kingfisher_scrapy.commands.incrementaldatastore import IncrementalDataStore
# tests/extensions/test_kingfisher_process_api.py fails if execute() is already called.
@pytest.mark.order(-1)
def test_command_without_arguments(capsys):
    """The command exits with an error if the required positional arguments are missing."""
    with pytest.raises(SystemExit):
        execute(['scrapy', 'incrementaldatastore'])
    stderr = capsys.readouterr().err
    # The usage message precedes the error; take the second-to-last line.
    last_line = stderr.rsplit("\n", 2)[-2]
    assert last_line == "pytest: error: The spider, database-schema and crawl-time arguments must be set."
@pytest.mark.order(-1)
def test_invalid_spider(capsys):
    """The command exits with an error if the spider argument is not a known spider."""
    with pytest.raises(SystemExit):
        execute(['scrapy', 'incrementaldatastore', 'nonexistent', 'test', 'test'])
    stderr = capsys.readouterr().err
    # The usage message precedes the error; take the second-to-last line.
    last_line = stderr.rsplit("\n", 2)[-2]
    assert last_line == "pytest: error: The spider argument 'nonexistent' is not a known spider."
@pytest.mark.order(-1)
def test_invalid_crawl_time(capsys):
    """The command exits with an error if the crawl-time argument is not an ISO 8601 timestamp."""
    with pytest.raises(SystemExit):
        execute(['scrapy', 'incrementaldatastore', 'fail', 'test', 'test'])
    stderr = capsys.readouterr().err
    # The usage message precedes the error; take the second-to-last line.
    last_line = stderr.rsplit("\n", 2)[-2]
    expected = (
        "pytest: error: The crawl-time argument 'test' must be in YYYY-MM-DDTHH:MM:SS format: time "
        "data 'test' does not match format '%Y-%m-%dT%H:%M:%S'"
    )
    assert last_line == expected
def test_format_date():
    """format_from_date() truncates an ISO 8601 timestamp to the requested granularity."""
    command = IncrementalDataStore()
    cases = (
        ('datetime', '2020-01-01T00:00:00'),
        ('date', '2020-01-01'),
        ('year-month', '2020-01'),
        ('year', '2020'),
    )
    for granularity, expected in cases:
        assert command.format_from_date('2020-01-01T00:00:00', granularity) == expected
@patch('scrapy.crawler.CrawlerProcess.crawl')
@pytest.mark.order(-1)
def test_command(crawl, caplog, tmp_path):
    """
    The command loads the crawl directory's JSON data into a SQL table and logs its progress.

    The crawl itself is mocked out (``crawl``), so the command operates only on the
    crawl directory seeded below. Requires a live database reachable via the
    KINGFISHER_COLLECT_DATABASE_URL environment variable.
    """
    # Seed a crawl directory (FILES_STORE/<spider>/<crawl-time>) with one release package.
    data_directory = tmp_path / 'fail' / '20200101_000000'
    data_directory.mkdir(parents=True)
    with (data_directory / 'data.json').open('w') as f:
        json.dump({'releases': [{'date': '2020-05-13T00:00:00Z'}]}, f)
    connection = psycopg2.connect(os.getenv('KINGFISHER_COLLECT_DATABASE_URL'))
    cursor = connection.cursor()
    try:
        # Point FILES_STORE at the temporary directory so the command finds the seeded crawl.
        settings = get_project_settings()
        settings['FILES_STORE'] = str(tmp_path)
        with pytest.raises(SystemExit):
            execute(['scrapy', 'incrementaldatastore', 'fail', 'public', '2020-01-01T00:00:00'], settings=settings)
        # The command is expected to have created a table named after the spider ('fail').
        cursor.execute("SELECT max(data->>'date') FROM fail")
        max_date = cursor.fetchone()[0]
        # Drop the table before asserting, so the database is clean even if an assertion fails.
        cursor.execute('DROP TABLE fail')
        connection.commit()
        assert max_date == '2020-05-13T00:00:00Z'
        # The last five log messages trace the command's main steps, in order.
        assert [record.message for record in caplog.records][-5:] == [
            'Getting the date from which to resume the crawl (if any)',
            'Running: scrapy crawl fail -a crawl_time=2020-01-01T00:00:00',
            'Reading the crawl directory',
            'Writing the JSON data to a CSV file',
            'Replacing the JSON data in the SQL table',
        ]
    finally:
        cursor.close()
        connection.close()