Skip to content

Commit

Permalink
chore(database_store): Skip CSV serialization
Browse files (browse the repository at this point in the history)
  • Loading branch information
jpmckinney committed May 16, 2024
1 parent 6480c7a commit cce300e
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 12 deletions.
20 changes: 10 additions & 10 deletions kingfisher_scrapy/extensions/database_store.py
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
-import csv
 import os
 import warnings
 from datetime import datetime
Expand Down Expand Up @@ -119,17 +118,15 @@ def spider_closed(self, spider, reason):
 convert_exceptions_to_warnings=True,
 )

-filename = os.path.join(crawl_directory, 'data.csv')
-spider.logger.info('Writing the JSON data to the %s CSV file', filename)
+filename = os.path.join(crawl_directory, 'data.jsonl')
+spider.logger.info('Writing the JSON data to the %s JSONL file', filename)
 count = 0
 with open(filename, 'w') as f:
-writer = csv.writer(f)
-
 with warnings.catch_warnings(record=True) as wlist:
-warnings.simplefilter("always", category=MergeErrorWarning)
+warnings.simplefilter('always', category=MergeErrorWarning)

 for item in data:
-writer.writerow([util.json_dumps(item, ensure_ascii=False).replace(r'\u0000', '')])
+f.write(util.json_dumps(item, ensure_ascii=False).replace(r'\u0000', '') + '\n')
 count += 1

 errors = []
Expand All @@ -149,9 +146,12 @@ def spider_closed(self, spider, reason):
 self.execute('DROP TABLE {table}', table=table_name)
 self.create_table(table_name)
 with open(filename) as f:
-self.cursor.copy_expert(self.format('COPY {table}(data) FROM STDIN WITH CSV', table=table_name), f)
-self.execute("CREATE INDEX {index} ON {table}(cast(data->>'date' as text))", table=table_name,
-index=f'idx_{table_name}')
+self.cursor.copy_expert(self.format('COPY {table} (data) FROM stdin', table=table_name), f)
+self.execute(
+"CREATE INDEX {index} ON {table} (cast(data->>'date' as text))",
+table=table_name,
+index=f'idx_{table_name}',
+)
 self.connection.commit()
 finally:
 self.cursor.close()
Expand Down
4 changes: 2 additions & 2 deletions tests/extensions/test_database_store.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -168,7 +168,7 @@ def test_spider_closed_warnings(cursor, caplog, tmpdir):
 assert [record.message for record in caplog.records] == [
 f'Reading the {tmpdir}/test/20210525_000000 crawl directory with the empty prefix',
 'Creating generator of compiled releases',
-f'Writing the JSON data to the {tmpdir}/test/20210525_000000/data.csv CSV file',
+f'Writing the JSON data to the {tmpdir}/test/20210525_000000/data.jsonl JSONL file',
 'Replacing the JSON data in the test table (3 rows)',
 ]

Expand Down Expand Up @@ -245,7 +245,7 @@ def test_spider_closed(cursor, caplog, tmpdir, data, data_type, sample, compile_

 expected_messages = [
 f'Reading the {tmpdir}/test{suffix}/20210525_000000 crawl directory with the {prefix} prefix',
-f'Writing the JSON data to the {tmpdir}/test{suffix}/20210525_000000/data.csv CSV file',
+f'Writing the JSON data to the {tmpdir}/test{suffix}/20210525_000000/data.jsonl JSONL file',
 f'Replacing the JSON data in the {expected_table} table (1 rows)',
 ]

Expand Down

0 comments on commit cce300e

Please sign in to comment.