-
Notifications
You must be signed in to change notification settings - Fork 49
/
scraperwiki.py
executable file
·88 lines (67 loc) · 2.85 KB
/
scraperwiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python
"""
Example showing how to import data from the Scraperwiki API.
"""
import json
import re
import requests
# --- PANDA connection settings ---------------------------------------------

# Root of the PANDA REST API (a local instance in this example).
PANDA_API = 'http://localhost:8000/api/1.0'

# Query-string credentials sent along with every PANDA request.
# NOTE(review): presumably sample credentials for a local install -- replace
# them with your own and avoid committing real keys.
PANDA_AUTH_PARAMS = {
    'email': 'panda@pandaproject.net',
    'api_key': 'edfe6c5ffd1be4d3bf22f69188ac6bc0fc04c84b',
}

# Slug of the dataset that receives the imported rows, and its endpoints.
PANDA_DATASET_SLUG = 'smith-county-criminal-cases'
PANDA_DATASET_URL = '{0}/dataset/{1}/'.format(PANDA_API, PANDA_DATASET_SLUG)
PANDA_DATA_URL = PANDA_DATASET_URL + 'data/'

# Number of rows sent to PANDA per bulk request.
PANDA_BULK_UPDATE_SIZE = 1000

# --- Scraperwiki source ----------------------------------------------------

# Datastore query (URL-encoded "select * from `swdata`") returning JSON dicts.
SCRAPERWIKI_URL = (
    'https://api.scraperwiki.com/api/1.0/datastore/sqlite'
    '?format=jsondict'
    '&name=tyler_criminal_records'
    '&query=select%20*%20from%20%60swdata%60'
)

# Columns copied from each Scraperwiki row, in the order PANDA stores them.
COLUMNS = [
    'cause_number',
    'date_filed',
    'defendant_name',
    'defendant_birthdate',
    'offense',
    'crime_date',
    'degree',
    'disposed',
    'court',
    'warrant_status',
    'attorney',
    'view_url',
]

# Type name for each entry of COLUMNS (same order); '' leaves it untyped.
COLUMN_TYPES = [
    '',      # cause_number
    'date',  # date_filed
    '',      # defendant_name
    '',      # defendant_birthdate
    '',      # offense
    '',      # crime_date
    '',      # degree
    '',      # disposed
    '',      # court
    '',      # warrant_status
    '',      # attorney
    '',      # view_url
]
# Utility functions
def panda_get(url, params=None):
    """
    Send an authenticated GET request to PANDA.

    ``params`` is an optional dict of extra query-string parameters. It is
    copied before the PANDA_AUTH_PARAMS credentials are merged in, so neither
    the caller's dict nor a shared default is ever modified. Credentials win
    if a key appears in both.

    Returns the response object produced by ``requests.get``.
    """
    query = dict(params or {})
    query.update(PANDA_AUTH_PARAMS)

    return requests.get(url, params=query)
def panda_put(url, data, params=None):
    """
    Send an authenticated PUT request with a JSON body to PANDA.

    ``data`` is the request body; callers in this script pass the output of
    ``json.dumps``. ``params`` is an optional dict of extra query-string
    parameters. It is copied before the PANDA_AUTH_PARAMS credentials are
    merged in, so neither the caller's dict nor a shared default is ever
    modified. Credentials win if a key appears in both.

    Returns the response object produced by ``requests.put``.
    """
    query = dict(params or {})
    query.update(PANDA_AUTH_PARAMS)

    return requests.put(
        url,
        data,
        params=query,
        headers={'Content-Type': 'application/json'}
    )
def slugify(value):
    """
    Graciously borrowed from Django core.

    Turn a text (unicode) string into a lowercase ASCII slug: characters are
    decomposed and anything without an ASCII equivalent is dropped, every
    character that is not a word character, whitespace or a hyphen is
    removed, surrounding whitespace is stripped, and each run of whitespace
    and/or hyphens collapses into a single hyphen.

    Returns a text string.
    """
    import unicodedata

    # Decompose accented characters, drop what ASCII cannot represent, then
    # decode back to text so the regexes below operate on text under both
    # Python 2 and Python 3.
    ascii_bytes = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
    value = ascii_bytes.decode('ascii')

    # Raw strings keep the regex escapes (\w, \s) out of Python's own
    # string-escape processing.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()

    return re.sub(r'[-\s]+', '-', value)
# Check if dataset exists
response = panda_get(PANDA_DATASET_URL)

# Create dataset if necessary
if response.status_code == 404:
    dataset = {
        'name': 'Scraperwiki: Smith County Criminal Case Records',
        'description': 'Results of the scraper at <a href="https://scraperwiki.com/scrapers/tyler_criminal_records/">https://scraperwiki.com/scrapers/tyler_criminal_records/</a>.'
    }

    # The column layout travels in the query string; the JSON body carries
    # only the dataset's name and description.
    response = panda_put(PANDA_DATASET_URL, json.dumps(dataset), params={
        'columns': ','.join(COLUMNS),
        'typed_columns': ','.join(['true' if t else '' for t in COLUMN_TYPES]),
        'column_types': ','.join(COLUMN_TYPES)
    })

# Fetch latest data from Scraperwiki
print('Fetching latest data')

response = requests.get(SCRAPERWIKI_URL)
data = json.loads(response.content)

# Rows are buffered here and sent to PANDA in batches.
put_data = {
    'objects': []
}

for row in data:
    put_data['objects'].append({
        'data': [row[c] for c in COLUMNS],
        'external_id': slugify(row['cause_number'])  # Slugify because a few have errant commas and such
    })

    # Flush as soon as the buffer holds a full batch. Testing the buffer
    # length (rather than the row index) keeps every batch, including the
    # first, at exactly PANDA_BULK_UPDATE_SIZE rows.
    if len(put_data['objects']) >= PANDA_BULK_UPDATE_SIZE:
        print('Updating %i rows...' % len(put_data['objects']))

        panda_put(PANDA_DATA_URL, json.dumps(put_data))
        put_data['objects'] = []

# Send whatever is left over after the last full batch.
if put_data['objects']:
    print('Updating %i rows' % len(put_data['objects']))

    panda_put(PANDA_DATA_URL, json.dumps(put_data))

print('Done')