diff --git a/kingfisher_scrapy/extensions/database_store.py b/kingfisher_scrapy/extensions/database_store.py
index 392491f0..3dad37c3 100644
--- a/kingfisher_scrapy/extensions/database_store.py
+++ b/kingfisher_scrapy/extensions/database_store.py
@@ -1,4 +1,3 @@
-import csv
 import os
 import warnings
 from datetime import datetime
@@ -119,17 +118,15 @@ def spider_closed(self, spider, reason):
                 convert_exceptions_to_warnings=True,
             )
 
-        filename = os.path.join(crawl_directory, 'data.csv')
-        spider.logger.info('Writing the JSON data to the %s CSV file', filename)
+        filename = os.path.join(crawl_directory, 'data.jsonl')
+        spider.logger.info('Writing the JSON data to the %s JSONL file', filename)
 
         count = 0
         with open(filename, 'w') as f:
-            writer = csv.writer(f)
-
             with warnings.catch_warnings(record=True) as wlist:
-                warnings.simplefilter("always", category=MergeErrorWarning)
+                warnings.simplefilter('always', category=MergeErrorWarning)
                 for item in data:
-                    writer.writerow([util.json_dumps(item, ensure_ascii=False).replace(r'\u0000', '')])
+                    f.write(util.json_dumps(item, ensure_ascii=False).replace(r'\u0000', '') + '\n')
                     count += 1
 
                 errors = []
@@ -149,9 +146,12 @@ def spider_closed(self, spider, reason):
                 self.execute('DROP TABLE {table}', table=table_name)
             self.create_table(table_name)
             with open(filename) as f:
-                self.cursor.copy_expert(self.format('COPY {table}(data) FROM STDIN WITH CSV', table=table_name), f)
-            self.execute("CREATE INDEX {index} ON {table}(cast(data->>'date' as text))", table=table_name,
-                         index=f'idx_{table_name}')
+                self.cursor.copy_expert(self.format('COPY {table} (data) FROM stdin', table=table_name), f)
+            self.execute(
+                "CREATE INDEX {index} ON {table} (cast(data->>'date' as text))",
+                table=table_name,
+                index=f'idx_{table_name}',
+            )
             self.connection.commit()
         finally:
             self.cursor.close()
diff --git a/tests/extensions/test_database_store.py b/tests/extensions/test_database_store.py
index da6fc895..d7f09c01 100644
--- a/tests/extensions/test_database_store.py
+++ b/tests/extensions/test_database_store.py
@@ -168,7 +168,7 @@ def test_spider_closed_warnings(cursor, caplog, tmpdir):
     assert [record.message for record in caplog.records] == [
         f'Reading the {tmpdir}/test/20210525_000000 crawl directory with the empty prefix',
         'Creating generator of compiled releases',
-        f'Writing the JSON data to the {tmpdir}/test/20210525_000000/data.csv CSV file',
+        f'Writing the JSON data to the {tmpdir}/test/20210525_000000/data.jsonl JSONL file',
         'Replacing the JSON data in the test table (3 rows)',
     ]
 
@@ -245,7 +245,7 @@ def test_spider_closed(cursor, caplog, tmpdir, data, data_type, sample, compile_
 
     expected_messages = [
         f'Reading the {tmpdir}/test{suffix}/20210525_000000 crawl directory with the {prefix} prefix',
-        f'Writing the JSON data to the {tmpdir}/test{suffix}/20210525_000000/data.csv CSV file',
+        f'Writing the JSON data to the {tmpdir}/test{suffix}/20210525_000000/data.jsonl JSONL file',
        f'Replacing the JSON data in the {expected_table} table (1 rows)',
     ]
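
Note on the replace(r'\u0000', '') call that this change keeps: PostgreSQL's jsonb type rejects the JSON escape sequence \u0000, which is presumably why the serialized output strips it before being streamed to COPY. Below is a minimal standalone sketch of that behavior, using the standard json module as a stand-in for the project's util.json_dumps (an assumption; the real helper may behave differently).

    import json

    # json.dumps always escapes control characters, even with ensure_ascii=False,
    # so an embedded NUL byte becomes the six-character sequence \u0000.
    item = {'id': 'ocds-213czf-1', 'note': 'bad\x00byte'}  # hypothetical example record
    serialized = json.dumps(item, ensure_ascii=False)
    assert r'\u0000' in serialized

    # r'\u0000' is the literal characters backslash-u-0-0-0-0, not the NUL
    # character itself; stripping it keeps the value loadable into a jsonb column.
    line = serialized.replace(r'\u0000', '') + '\n'
    print(line)  # {"id": "ocds-213czf-1", "note": "badbyte"}

Dropping the CSV wrapper also removes a layer of quoting: with text-format COPY ... FROM stdin, each line of data.jsonl is taken as one row for the single data column, so no csv.writer round-trip is needed on either side.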