Showing 13 changed files with 609 additions and 31 deletions.
109 changes: 109 additions & 0 deletions
datapackage_pipelines_measure/pipeline_steps/outputs.py
import os

from datapackage_pipelines_measure.config import settings

import logging
log = logging.getLogger(__name__)

DOWNLOADS_PATH = os.path.join(os.path.dirname(__file__), '../../downloads')

label = 'outputs'


def add_steps(steps: list, pipeline_id: str,
              project_id: str, config: list) -> list:

    # Fetch the most recently stored entry for each (project_id, source,
    # source_id), so collection can resume from where it left off.
    steps.append(('measure.datastore_get_latest', {
        'resource-name': 'latest-project-entries',
        'table': 'outputs',
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'source', 'source_id'],
        'sort_date_key': 'source_timestamp'
    }))

    # Add one resource per configured source spreadsheet.
    for source in config:
        steps.append(('measure.add_outputs_resource', {
            'sheet_id': source.get('sheetid'),
            'gid': source.get('gid'),
            'source_type': source.get('type')
        }))

    steps.append(('measure.remove_resource', {
        'name': 'latest-project-entries'
    }))

    steps.append(('concatenate', {
        'target': {
            'name': 'outputs',
            'path': 'data/outputs.csv'},
        'fields': {
            'source_id': [],
            'source_type': [],
            'source': [],
            'source_timestamp': [],
            'source_email': [],
            'output_title': [],
            'output_type': [],
            'output_organization': [],
            'output_person': [],
            'output_link': [],
            'output_date': []}
    }))

    steps.append(('set_types', {
        'types': {
            'source_id': {
                'type': 'string'
            },
            'source_type': {
                'type': 'string'
            },
            'source': {
                'type': 'string'
            },
            'source_timestamp': {
                'type': 'datetime'
            },
            'source_email': {
                'type': 'string'
            },
            'output_title': {
                'type': 'string'
            },
            'output_organization': {
                'type': 'string'
            },
            'output_person': {
                'type': 'string'
            },
            'output_link': {
                'type': 'string'
            },
            'output_date': {
                'type': 'date'
            }}
    }))

    steps.append(('measure.add_project_name', {'name': project_id}))
    # Parameterless steps take a one-element tuple; the trailing comma
    # matters, since a bare string would break step unpacking.
    steps.append(('measure.add_timestamp',))
    steps.append(('measure.add_uuid',))

    # Dump to path if in development mode
    if settings.get('DEVELOPMENT', False):
        steps.append(('dump.to_path', {
            'out-path': '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        }))

    steps.append(('dump.to_sql', {
        'engine': settings.get('DB_ENGINE'),
        'tables': {
            'outputs': {
                'resource-name': 'outputs',
                'mode': 'update',
                'update_keys': ['project_id', 'source', 'source_timestamp',
                                'source_id']
            }
        }
    }))

    return steps
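
For orientation, a minimal sketch of how this step list might be assembled. The identifiers below are hypothetical placeholders; the 'sheetid', 'gid' and 'type' keys mirror the ones add_steps reads from each source entry.

config = [
    {'sheetid': 'EXAMPLE-SHEET-ID', 'gid': '0', 'type': 'internal'}
]
steps = add_steps(steps=[], pipeline_id='example-outputs',
                  project_id='example-project', config=config)
# Each entry in `steps` is now a (processor-name, parameters) tuple for
# datapackage-pipelines to run in order.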
138 changes: 138 additions & 0 deletions
datapackage_pipelines_measure/processors/add_outputs_resource.py
import re
import json
# Import the submodules used below explicitly; `import urllib` and
# `import dateutil` alone don't expose `parse`/`parser`.
import urllib.parse

import dateutil.parser

from datapackage_pipelines.generators import slugify
from datapackage_pipelines.wrapper import ingest, spew

from datapackage_pipelines_measure.processors import google_utils

import logging
log = logging.getLogger(__name__)

# Column A of the response sheet holds the form submission timestamp.
TIMESTAMP_COL = 'A'
FAR_PAST_START_DATE = '1990-01-01'


def _request_data_from_google_spreadsheet(start_date):
    '''
    Build a google charts query and append it to an authorised spreadsheets
    request, returning the response. Uses the module-level `sheet_id` and
    `gid` parsed from the processor parameters below.
    '''
    def _build_charts_query(start_date):
        '''
        Build and return a charts query to fetch the most recent rows from
        the spreadsheet, based on the most recent data collected for this
        source.
        '''
        query = '''
            SELECT *
            WHERE {timestamp} > date '{start_date}'
            ORDER BY {timestamp}
        '''.format(timestamp=TIMESTAMP_COL, start_date=start_date)
        query = query.strip()
        return urllib.parse.quote(query)

    def _parse_response_to_dict(response):
        '''Parse the response from google api and return the bit we want as
        a native dict'''
        # Strip the JSONP-style wrapper around the JSON body.
        regexp = re.compile(rb"(^/\*O_o\*/\ngoogle\.visualization\.Query\.setResponse\(|\);$)")  # noqa
        return json.loads(re.sub(regexp, b'', response[1]).decode())

    base_request_url = 'https://docs.google.com/spreadsheets/d/{}/gviz/tq?gid={}&headers=1&tq={}'  # noqa
    request_url = base_request_url.format(sheet_id, gid,
                                          _build_charts_query(start_date))

    authed_http = google_utils.get_authorized_http_object(
        google_utils.GOOGLE_API_DRIVE_SCOPES)

    raw_response = authed_http.request(request_url)

    response = _parse_response_to_dict(raw_response)
    if response.get('status') == 'error':
        raise ValueError('The following error was returned:\n{}'
                         .format(response['errors'][0].get('detailed_message')))  # noqa
    return response
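
# For illustration only (values hypothetical): the raw gviz body that
# _parse_response_to_dict unwraps looks roughly like
#
#   /*O_o*/
#   google.visualization.Query.setResponse({"status":"ok","table":{
#       "cols":[{"label":"Timestamp"}, ...],
#       "rows":[{"c":[{"v":"Date(2017,0,1)","f":"1/1/2017 10:00:00"}, ...]}]}});
#
# The regexp above strips that wrapper so json.loads can parse the object;
# form_collector below reads each cell's formatted value ('f') before
# falling back to its raw value ('v').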


def form_collector(source_id, source_type, latest_date):
    # Collect from the day of the last stored entry, or from the far past
    # if nothing has been collected for this source yet.
    start_date = FAR_PAST_START_DATE
    if latest_date:
        start_date = latest_date.date()

    response = _request_data_from_google_spreadsheet(start_date)

    resource_content = []
    headers = response['table']['cols']
    headers = [slugify(h['label'].lower()) for h in headers]
    for r in response['table']['rows']:
        row = r['c']
        row_dict = {}
        for i, v in enumerate(row):
            if v is not None:
                row_dict[headers[i]] = v.get('f') or v.get('v')
            else:
                row_dict[headers[i]] = None
        output_date = dateutil.parser.parse(row_dict.get('date')).date() \
            if row_dict.get('date') is not None else None
        res_row = {
            'source_id': source_id,
            'source_type': source_type,
            'source': 'gsheets',
            'source_timestamp':
                dateutil.parser.parse(row_dict.get('timestamp')),
            'source_email': row_dict.get('email-address'),
            'output_title': row_dict.get('title'),
            'output_type': row_dict.get('type-of-output'),
            'output_organization': row_dict.get('for-what-organisation'),
            'output_person': row_dict.get('who-did-this'),
            'output_link': row_dict.get('link-if-published'),
            'output_date': output_date
        }
        resource_content.append(res_row)

    return resource_content


def process_resources(res_iter, datapackage, source_id, source_type):

    def get_latest_date(first):
        # Find the most recent timestamp previously stored for this source,
        # passing every row through unchanged.
        latest_date = None
        my_rows = []
        for row in first:
            if row['source_id'] == source_id and row['source'] == 'gsheets':
                latest_date = row['source_timestamp']
            my_rows.append(row)
        return latest_date, iter(my_rows)

    if len(datapackage['resources']) \
            and datapackage['resources'][0]['name'] == 'latest-project-entries':  # noqa
        latest_date, latest_iter = get_latest_date(next(res_iter))
        yield latest_iter
    else:
        latest_date = None
    # Pass any remaining resources through before appending the new rows.
    yield from res_iter
    yield form_collector(source_id, source_type, latest_date)


parameters, datapackage, res_iter = ingest()

sheet_id = parameters['sheet_id']
gid = parameters['gid']
source_type = parameters['source_type']
source_id = '{0}/{1}'.format(sheet_id, gid)
resource = {
    'name': slugify(sheet_id).lower(),
    'path': 'data/{}.csv'.format(slugify(sheet_id))
}

headers = ['source', 'source_type', 'source_timestamp', 'source_email',
           'output_title', 'output_type', 'output_organization',
           'output_person', 'output_link', 'output_date']
resource['schema'] = {'fields': [{'name': h, 'type': 'string'}
                                 for h in headers]}

datapackage['resources'].append(resource)

spew(datapackage, process_resources(res_iter, datapackage,
                                    source_id, source_type))
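
For reference, a sketch of a single row emitted by form_collector, with hypothetical values; the keys match the fields concatenated and typed in pipeline_steps/outputs.py above.

{
    'source_id': 'EXAMPLE-SHEET-ID/0',
    'source_type': 'internal',
    'source': 'gsheets',
    'source_timestamp': datetime.datetime(2017, 1, 1, 10, 0),
    'source_email': 'someone@example.org',
    'output_title': 'Example output',
    'output_type': 'blog post',
    'output_organization': 'Example Org',
    'output_person': 'A. Person',
    'output_link': 'https://example.org/post',
    'output_date': datetime.date(2017, 1, 1)
}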