/
loaddatasets.py
109 lines (95 loc) · 3.48 KB
/
loaddatasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import csv
import lzma
from tempfile import NamedTemporaryFile
from urllib.request import urlretrieve
from django.conf import settings
from django.core.management.base import BaseCommand
from jarbas.core.models import Document
class Command(BaseCommand):
    """Management command that loads the Serenata de Amor datasets.

    Each dataset is an xz-compressed CSV file downloaded from the URL built
    by `get_url`. Every row is type-converted by `serialize` and then stored
    through `Document.objects.update_or_create`.
    """

    help = 'Load Serenata de Amor datasets into the database'

    # Running totals. They drive the progress message printed by `handle`
    # and the early stop implemented by `has_reached_the_limit`.
    created = 0
    updated = 0

    def parse(self):
        """Download every dataset and yield its rows one at a time.

        This is a generator: rows are produced lazily, so the counters
        updated by `handle` between two rows are taken into account by the
        limit check before the next row is produced.

        Yields:
            dict: a row exactly as produced by `csv.DictReader` (values are
            not converted yet; see `serialize`).
        """
        suffixes = ('current-year', 'last-year', 'previous-years')
        for url in map(self.get_url, suffixes):
            print("Loading " + url)
            with NamedTemporaryFile() as tmp:
                # The download is written to the temporary file's path and
                # then reopened by name as xz-compressed text.
                urlretrieve(url, filename=tmp.name)
                with lzma.open(tmp.name, mode='rt') as file_handler:
                    for row in csv.DictReader(file_handler):
                        if self.has_reached_the_limit():
                            break
                        yield row

            # Do not download the remaining datasets once the limit is hit.
            if self.has_reached_the_limit():
                break

    def handle(self, *args, **options):
        """Create or update one `Document` per parsed row.

        A record is looked up by `document_id`, `year`, `month`,
        `applicant_id` and `congressperson_id` together; the whole
        serialized row is passed as `defaults`. A progress line is printed
        after every row.
        """
        for document in map(self.serialize, self.parse()):
            obj, created = Document.objects.update_or_create(
                document_id=document['document_id'],
                year=document['year'],
                month=document['month'],
                applicant_id=document['applicant_id'],
                congressperson_id=document['congressperson_id'],
                defaults=document
            )

            if created:
                self.created += 1
            else:
                self.updated += 1

            # "\r" rewinds the cursor so the next message overwrites this one.
            msg = "{:,} records created / {:,} records updated "
            print(msg.format(self.created, self.updated), end="\r")

    def serialize(self, document):
        """Convert the string values of a CSV row to numbers, in place.

        Args:
            document: a row dict as yielded by `parse`. It is modified in
                place and must contain every key listed below.

        Returns:
            dict: the same `document` object, with the integer and float
            fields converted by `to_number` and an empty `issue_date`
            replaced by `None`.

        Raises:
            KeyError: if an expected column is missing from `document`.
            ValueError: if a value cannot be converted to a number.
        """
        integers = (
            'document_id',
            'congressperson_id',
            'congressperson_document',
            'term',
            'term_id',
            'subquota_number',
            'subquota_group_id',
            'document_type',
            'month',
            'year',
            'installment',
            'batch_number',
            'reimbursement_number',
            'applicant_id'
        )
        for key in integers:
            document[key] = self.to_number(document[key], int)

        floats = (
            'document_value',
            'remark_value',
            'net_value',
            'reimbursement_value'
        )
        for key in floats:
            document[key] = self.to_number(document[key], float)

        if document['issue_date'] == '':
            document['issue_date'] = None

        return document

    def has_reached_the_limit(self):
        """Tell whether the number of processed records hit the limit.

        Returns:
            bool: `True` when `settings.DATABASE_LIMIT` is truthy and the
            sum of created and updated records is at least that value;
            `False` otherwise (a falsy setting means there is no limit).
        """
        limit = settings.DATABASE_LIMIT
        total = self.created + self.updated
        return bool(limit) and total >= limit

    @staticmethod
    def get_url(suffix):
        """Build the download URL of the dataset identified by `suffix`.

        Args:
            suffix: dataset name suffix, e.g. `'current-year'`.

        Returns:
            str: URL assembled from the `AMAZON_S3_REGION`,
            `AMAZON_S3_BUCKET` and `AMAZON_S3_DATASET_DATE` settings.
        """
        file_name = '{date}-{suffix}.xz'.format(
            date=settings.AMAZON_S3_DATASET_DATE,
            suffix=suffix
        )
        return 'https://{region}.amazonaws.com/{bucket}/{file_name}'.format(
            region=settings.AMAZON_S3_REGION,
            bucket=settings.AMAZON_S3_BUCKET,
            file_name=file_name
        )

    @staticmethod
    def to_number(value, type_of_number):
        """Convert a CSV cell to a number, mapping missing values to 0.

        Args:
            value: the raw cell, normally a string. `None`, a blank string
                and `nan` in any letter case all count as missing.
            type_of_number: the target type, `int` or `float`.

        Returns:
            The converted number, or `0` when the value is missing.

        Raises:
            ValueError: if the value is not numeric, or if it has a
                fractional part that `type_of_number` would drop.
        """
        if value is None:
            return 0

        if isinstance(value, str):
            value = value.strip()
            if value.lower() in ('nan', ''):
                return 0

        try:
            return type_of_number(value)
        except ValueError:
            # Integer columns may be written as floats (e.g. "42.0"), which
            # `int` rejects. Go through `float`, but refuse to silently
            # truncate a real fractional part such as "42.5".
            number = float(value)
            converted = type_of_number(number)
            if converted != number:
                raise ValueError(
                    'cannot convert {!r} to {} without losing data'.format(
                        value, type_of_number.__name__
                    )
                )
            return converted