This repository has been archived by the owner on Dec 2, 2021. It is now read-only.

Merge e7c98a2 into 92ade9c
brew committed Jun 23, 2017
2 parents 92ade9c + e7c98a2 commit b696b60
Showing 13 changed files with 609 additions and 31 deletions.
34 changes: 33 additions & 1 deletion README.md
@@ -261,6 +261,38 @@ The Google Analytics processor requires a Google API account with the **Google A
1. Give Measure credentials to the websites' analytics you'd like to track:
- Add the service account email to the list of users that have read permissions on the given analytics accounts

### Outputs

'Outputs' refers to secondary events and products related to a project, e.g. blog posts, talks given, or tangible uses of our products. These can be produced either internally or externally.

We capture these outputs manually using Google Forms, which writes the results to a Google Spreadsheet.

#### Outputs Captured by Google Forms

The Outputs processor requires a Google API account with generated credentials to read private Google Spreadsheet URLs.

1. Make a copy of the Outputs Form template for your project (https://docs.google.com/a/okfn.org/forms/d/e/1FAIpQLSfQuBlwZMnWhGjCv4teAMdsKQ3pgbAi08ZwKBtZLAQFw7LqDg/viewform)
2. Configure the associated spreadsheet destination where captured data will be written. This can be found within the form's 'Responses' tab, in the settings dropdown > 'Select response destination'
3. Go to the form's spreadsheet and make a note of the `sheetid` and `gid`, which are part of the spreadsheet URL:
```https://docs.google.com/spreadsheets/d/{sheetid}/edit#gid={gid}```
4. Ensure the spreadsheet can be read by the Google API service account that is being used to authorise requests, either by making the spreadsheet public or by sharing it with the email address associated with the service account (defined in the generated credentials)
5. Configure the Measure project with an entry for the Outputs processor:

```yaml
# sheetid and gid correspond with the parts of the spreadsheet url:
# https://docs.google.com/spreadsheets/d/{sheetid}/edit#gid={gid}

config:
  outputs:
    - sheetid: "{sheetid from above}"
      gid: "{gid from above}"
      type: "external" # the type of outputs captured here
    - sheetid: "{another sheetid}"
      gid: "{another gid}"
      type: "internal"
```
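
For reference, the Outputs processor reads each configured sheet through Google's gviz (charts query) endpoint. The sketch below shows roughly how a configured `sheetid`/`gid` pair becomes that request; the sheet id, gid, and start date are placeholders, and column `A` is assumed to be the form's Timestamp column.

```python
# Rough sketch of the request URL built by the add_outputs_resource processor;
# the sheetid/gid values are placeholders from the config example above.
import urllib.parse

sheet_id = '{sheetid from above}'
gid = '{gid from above}'

# Only rows newer than the last collected timestamp are requested.
query = urllib.parse.quote("SELECT * WHERE A > date '1990-01-01' ORDER BY A")

url = ('https://docs.google.com/spreadsheets/d/{}/gviz/tq'
       '?gid={}&headers=1&tq={}'.format(sheet_id, gid, query))
print(url)
```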

## Environmental Variables

Each installation of Measure requires certain environmental variables to be set.
@@ -283,7 +315,7 @@ Each installation of Measure requires certain environmental variables to be set.
### Facebook
- `MEASURE_FACEBOOK_API_ACCESS_TOKEN_{PAGE NAME IN UPPERCASE}`: The page access token obtained from [How to get a Facebook Page Access Token](#how-to-get-a-facebook-page-access-token).

-### PyPI & Google analytics
+### Google credentials for PyPI, Google analytics, and Outputs
See the [PyPI Big Query API](#pypi-configuration) instructions above to get the values for these env vars:
- `MEASURE_GOOGLE_API_PROJECT_ID`: {project_id}
- `MEASURE_GOOGLE_API_JWT_AUTH_PROVIDER_X509_CERT_URL`: {auth_provider_x509_cert_url}
3 changes: 2 additions & 1 deletion datapackage_pipelines_measure/pipeline_steps/__init__.py
@@ -3,8 +3,9 @@
     code_hosting,
     code_packaging,
     website_analytics,
+    outputs,
     example
 )
 
 __all__ = ['social_media', 'code_hosting', 'code_packaging',
-           'website_analytics', 'example']
+           'website_analytics', 'outputs', 'example']
@@ -22,15 +22,13 @@ def add_steps(steps: list, pipeline_id: str,
     if 'npm' in config:
         for package in config['npm']['packages']:
             steps.append(('measure.add_npm_resource', {
-                'package': slugify(package),
-                'project_id': project_id
+                'package': slugify(package)
             }))
 
     if 'pypi' in config:
         for package in config['pypi']['packages']:
             steps.append(('measure.add_pypi_resource', {
-                'package': slugify(package),
-                'project_id': project_id
+                'package': slugify(package)
             }))
 
     steps.append(('measure.remove_resource', {
109 changes: 109 additions & 0 deletions datapackage_pipelines_measure/pipeline_steps/outputs.py
@@ -0,0 +1,109 @@
import os

from datapackage_pipelines_measure.config import settings

import logging
log = logging.getLogger(__name__)

DOWNLOADS_PATH = os.path.join(os.path.dirname(__file__), '../../downloads')

label = 'outputs'


def add_steps(steps: list, pipeline_id: str,
              project_id: str, config: dict) -> list:

    steps.append(('measure.datastore_get_latest', {
        'resource-name': 'latest-project-entries',
        'table': 'outputs',
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'source', 'source_id'],
        'sort_date_key': 'source_timestamp'
    }))

    for source in config:
        steps.append(('measure.add_outputs_resource', {
            'sheet_id': source.get('sheetid'),
            'gid': source.get('gid'),
            'source_type': source.get('type')
        }))

    steps.append(('measure.remove_resource', {
        'name': 'latest-project-entries'
    }))

    steps.append(('concatenate', {
        'target': {
            'name': 'outputs',
            'path': 'data/outputs.csv'},
        'fields': {
            'source_id': [],
            'source_type': [],
            'source': [],
            'source_timestamp': [],
            'source_email': [],
            'output_title': [],
            'output_type': [],
            'output_organization': [],
            'output_person': [],
            'output_link': [],
            'output_date': []}
    }))

    steps.append(('set_types', {
        'types': {
            'source_id': {
                'type': 'string'
            },
            'source_type': {
                'type': 'string'
            },
            'source': {
                'type': 'string'
            },
            'source_timestamp': {
                'type': 'datetime'
            },
            'source_email': {
                'type': 'string'
            },
            'output_title': {
                'type': 'string'
            },
            'output_organization': {
                'type': 'string'
            },
            'output_person': {
                'type': 'string'
            },
            'output_link': {
                'type': 'string'
            },
            'output_date': {
                'type': 'date'
            }}
    }))

    steps.append(('measure.add_project_name', {'name': project_id}))
    steps.append(('measure.add_timestamp'))
    steps.append(('measure.add_uuid'))

    # Dump to path if in development mode
    if settings.get('DEVELOPMENT', False):
        steps.append(('dump.to_path', {
            'out-path': '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        }))

    steps.append(('dump.to_sql', {
        'engine': settings.get('DB_ENGINE'),
        'tables': {
            'outputs': {
                'resource-name': 'outputs',
                'mode': 'update',
                'update_keys': ['project_id', 'source', 'source_timestamp',
                                'source_id']
            }
        }
    }))

    return steps
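
As an illustration of how this module is consumed, below is a hypothetical direct call to `add_steps` using the 'outputs' config from the README example; the pipeline id, project id, and sheet values are invented for the example (in practice the pipeline generator supplies them).

```python
# Hypothetical, stand-alone invocation of outputs.add_steps; ids and sheet
# values are placeholders, not a real Measure project.
from datapackage_pipelines_measure.pipeline_steps import outputs

steps = outputs.add_steps(
    steps=[],
    pipeline_id='example-project-outputs',
    project_id='example-project',
    config=[
        {'sheetid': '{sheetid from above}',
         'gid': '{gid from above}',
         'type': 'external'},
    ],
)

# Print the processor names in execution order; some steps are plain strings,
# others are (name, parameters) tuples.
print([s[0] if isinstance(s, tuple) else s for s in steps])
```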
@@ -136,7 +136,6 @@ def npm_collector(package, latest_date):
 parameters, datapackage, res_iter = ingest()
 
 package = parameters['package']
-project_id = parameters['project_id']
 resource = {
     'name': slugify(package),
     'path': 'data/{}.csv'.format(slugify(package))
138 changes: 138 additions & 0 deletions datapackage_pipelines_measure/processors/add_outputs_resource.py
@@ -0,0 +1,138 @@
import re
import json
import dateutil
import urllib

from datapackage_pipelines.generators import slugify
from datapackage_pipelines.wrapper import ingest, spew

from datapackage_pipelines_measure.processors import google_utils

import logging
log = logging.getLogger(__name__)

TIMESTAMP_COL = 'A'
FAR_PAST_START_DATE = '1990-01-01'


def _request_data_from_google_spreadsheet(start_date):
    '''
    Build a google charts query and append it to an authorised spreadsheets
    request, returning the response.
    '''
    def _build_charts_query(start_date):
        '''
        Build and return a charts query to fetch the most recent rows from the
        spreadsheet, based on the most recent data collected for this source.
        '''
        query = '''
            SELECT *
            WHERE {timestamp} > date '{start_date}'
            ORDER BY {timestamp}
        '''.format(timestamp=TIMESTAMP_COL, start_date=start_date)
        query = query.strip()
        return urllib.parse.quote(query)

    def _parse_response_to_dict(response):
        '''Parse the response from google api and return the bit we want as a
        native dict'''
        regexp = re.compile(b"(^\/\*O_o\*\/\\ngoogle\.visualization\.Query\.setResponse\(|\);$)")  # noqa
        return json.loads(re.sub(regexp, b'', response[1]).decode())

    base_request_url = 'https://docs.google.com/spreadsheets/d/{}/gviz/tq?gid={}&headers=1&tq={}'  # noqa
    request_url = base_request_url.format(sheet_id, gid,
                                          _build_charts_query(start_date))

    authed_http = google_utils.get_authorized_http_object(
        google_utils.GOOGLE_API_DRIVE_SCOPES)

    raw_response = authed_http.request(request_url)

    response = _parse_response_to_dict(raw_response)
    if response.get('status') == 'error':
        raise ValueError('The following error was returned:\n{}'
                         .format(response['errors'][0].get('detailed_message')))  # noqa
    return response


def form_collector(source_id, source_type, latest_date):
    start_date = FAR_PAST_START_DATE
    if latest_date:
        start_date = latest_date.date()

    response = _request_data_from_google_spreadsheet(start_date)

    resource_content = []
    headers = response['table']['cols']
    headers = [slugify(h['label'].lower()) for h in headers]
    for r in response['table']['rows']:
        row = r['c']
        row_dict = {}
        for i, v in enumerate(row):
            if v is not None:
                row_dict[headers[i]] = v.get('f') or v.get('v')
            else:
                row_dict[headers[i]] = None
        output_date = dateutil.parser.parse(row_dict.get('date')).date() \
            if row_dict.get('date') is not None else None
        res_row = {
            'source_id': source_id,
            'source_type': source_type,
            'source': 'gsheets',
            'source_timestamp':
                dateutil.parser.parse(row_dict.get('timestamp')),
            'source_email': row_dict.get('email-address'),
            'output_title': row_dict.get('title'),
            'output_type': row_dict.get('type-of-output'),
            'output_organization': row_dict.get('for-what-organisation'),
            'output_person': row_dict.get('who-did-this'),
            'output_link': row_dict.get('link-if-published'),
            'output_date': output_date
        }
        resource_content.append(res_row)

    return resource_content


def process_resources(res_iter, datapackage, source_id, source_type):

    def get_latest_date(first):
        latest_date = None
        my_rows = []
        for row in first:
            if row['source_id'] == source_id and row['source'] == 'gsheets':
                latest_date = row['source_timestamp']
            my_rows.append(row)
        return latest_date, iter(my_rows)

    if len(datapackage['resources']):
        if datapackage['resources'][0]['name'] == 'latest-project-entries':
            latest_date, latest_iter = get_latest_date(next(res_iter))
            yield latest_iter
        else:
            latest_date = None
            yield from res_iter
    yield form_collector(source_id, source_type, latest_date)


parameters, datapackage, res_iter = ingest()

sheet_id = parameters['sheet_id']
gid = parameters['gid']
source_type = parameters['source_type']
source_id = '{0}/{1}'.format(sheet_id, gid)
resource = {
    'name': slugify(sheet_id).lower(),
    'path': 'data/{}.csv'.format(slugify(sheet_id))
}

headers = ['source', 'source_type', 'source_timestamp', 'source_email',
           'output_title', 'output_type', 'output_organization',
           'output_person', 'output_link', 'output_date']
resource['schema'] = {'fields': [{'name': h, 'type': 'string'}
                                 for h in headers]}

datapackage['resources'].append(resource)

spew(datapackage, process_resources(res_iter, datapackage,
                                    source_id, source_type))
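
To make the response handling above concrete, here is a small self-contained illustration of the unwrapping that `_parse_response_to_dict` performs; the byte payload is fabricated for the example and is not a real gviz response.

```python
import json
import re

# Fabricated gviz-style body; real responses wrap a JSON object carrying the
# spreadsheet 'table' with its 'cols' and 'rows'.
raw_body = (b'/*O_o*/\n'
            b'google.visualization.Query.setResponse('
            b'{"status": "ok", "table": {"cols": [{"label": "Timestamp"}], '
            b'"rows": [{"c": [{"v": "Date(2017,5,23)", "f": "23/06/2017"}]}]}}'
            b');')

# Strip the setResponse(...) wrapper, as the processor does, then parse.
regexp = re.compile(
    rb"(^/\*O_o\*/\ngoogle\.visualization\.Query\.setResponse\(|\);$)")
parsed = json.loads(re.sub(regexp, b'', raw_body).decode())
print(parsed['table']['rows'][0]['c'][0]['f'])  # -> 23/06/2017
```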
@@ -116,7 +116,6 @@ def pypi_collector(package, latest_date):
 parameters, datapackage, res_iter = ingest()
 
 package = parameters['package']
-project_id = parameters['project_id']
 resource = {
     'name': slugify(package),
     'path': 'data/{}.csv'.format(slugify(package))
@@ -16,6 +16,7 @@
 engine = parameters['engine']
 resource_name = parameters['resource-name']
 distinct_on = ', '.join(parameters['distinct_on'])
+sort_date_key = parameters.get('sort_date_key', 'date')
 
 Base = automap_base()
 engine = create_engine(engine)
@@ -33,7 +34,8 @@
 s = text(
     "SELECT DISTINCT ON ({0}) *"
     "FROM {1} "
-    "ORDER BY {0}, date DESC".format(distinct_on, table)
+    "ORDER BY {0}, {2} DESC"
+    .format(distinct_on, table, sort_date_key)
 )
 
 results = session.query(Table).from_statement(s).all()
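
With the new `sort_date_key` parameter, the outputs pipeline can deduplicate on `source_timestamp` instead of the default `date` column. Roughly, the statement rendered for the outputs table would look like the following (an illustration mirroring the `.format` call above, not the verbatim string):

```python
# Illustration of the SQL rendered for the outputs table, using the
# parameters set by pipeline_steps/outputs.py.
distinct_on = ', '.join(['project_id', 'source', 'source_id'])
table = 'outputs'
sort_date_key = 'source_timestamp'

s = ("SELECT DISTINCT ON ({0}) * "
     "FROM {1} "
     "ORDER BY {0}, {2} DESC").format(distinct_on, table, sort_date_key)
print(s)
# SELECT DISTINCT ON (project_id, source, source_id) * FROM outputs
# ORDER BY project_id, source, source_id, source_timestamp DESC
```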
