This repository has been archived by the owner on Dec 2, 2021. It is now read-only.

Merge e7c98a2 into 92ade9c
brew committed Jun 23, 2017
2 parents 92ade9c + e7c98a2 commit b696b60
Showing 13 changed files with 609 additions and 31 deletions.
34 changes: 33 additions & 1 deletion README.md
@@ -261,6 +261,38 @@ The Google Analytics processor requires a Google API account with the **Google A
1. Give Measure credentials to the websites' analytics you'd like to track:
- Add the service account email to the list of users that have read permissions on the given analytics accounts

### Outputs

'Outputs' refers to secondary events and products related to a project, e.g. blog posts, talks given, or tangible uses of our products. These can be produced either internally or externally.

We capture these outputs manually using Google Forms, which writes the results to a Google Spreadsheet.

#### Outputs Captured by Google Forms

The Outputs processor requires a Google API account with generated credentials to read private Google Spreadsheet URLs.

1. Make a copy of the Outputs Form template for your project (https://docs.google.com/a/okfn.org/forms/d/e/1FAIpQLSfQuBlwZMnWhGjCv4teAMdsKQ3pgbAi08ZwKBtZLAQFw7LqDg/viewform)
2. Configure the associated spreadsheet destination where captured data will be written. This can be found within the form's 'Responses' tab, in the settings dropdown > 'Select response destination'
3. Go to the form's spreadsheet and make a note of the `sheetid` and `gid`, which are part of the spreadsheet URL:
```https://docs.google.com/spreadsheets/d/{sheetid}/edit#gid={gid}```
4. Ensure the spreadsheet can be read by the Google API service account that is being used to authorise requests, either by making the spreadsheet public or by sharing it with the email address associated with the service account (defined in the generated credentials)
5. Configure the Measure project with an entry for the Outputs processor:

```yaml
# sheetid and gid correspond with the parts of the spreadsheet url:
# https://docs.google.com/spreadsheets/d/{sheetid}/edit#gid={gid}

config:
  outputs:
    - sheetid: "{sheetid from above}"
      gid: "{gid from above}"
      type: "external" # the type of outputs captured here
    - sheetid: "{another sheetid}"
      gid: "{another gid}"
      type: "internal"
```
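
For reference, the Outputs processor reads each configured sheet through Google's gviz (charts query) endpoint. The sketch below shows roughly how a configured `sheetid`/`gid` pair becomes that request; the sheet id, gid, and start date are placeholders, and column `A` is assumed to be the form's Timestamp column.

```python
# Rough sketch of the request URL built by the add_outputs_resource processor;
# the sheetid/gid values are placeholders from the config example above.
import urllib.parse

sheet_id = '{sheetid from above}'
gid = '{gid from above}'

# Only rows newer than the last collected timestamp are requested.
query = urllib.parse.quote("SELECT * WHERE A > date '1990-01-01' ORDER BY A")

url = ('https://docs.google.com/spreadsheets/d/{}/gviz/tq'
       '?gid={}&headers=1&tq={}'.format(sheet_id, gid, query))
print(url)
```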

## Environmental Variables

Each installation of Measure requires certain environmental variables to be set.
@@ -283,7 +315,7 @@ Each installation of Measure requires certain environmental variables to be set.
### Facebook
- `MEASURE_FACEBOOK_API_ACCESS_TOKEN_{PAGE NAME IN UPPERCASE}`: The page access token obtained from [How to get a Facebook Page Access Token](#how-to-get-a-facebook-page-access-token).

-### PyPI & Google analytics
+### Google credentials for PyPI, Google analytics, and Outputs
See the [PyPI Big Query API](#pypi-configuration) instructions above to get the values for these env vars:
- `MEASURE_GOOGLE_API_PROJECT_ID`: {project_id}
- `MEASURE_GOOGLE_API_JWT_AUTH_PROVIDER_X509_CERT_URL`: {auth_provider_x509_cert_url}
3 changes: 2 additions & 1 deletion datapackage_pipelines_measure/pipeline_steps/__init__.py
@@ -3,8 +3,9 @@
     code_hosting,
     code_packaging,
     website_analytics,
+    outputs,
     example
 )
 
 __all__ = ['social_media', 'code_hosting', 'code_packaging',
-           'website_analytics', 'example']
+           'website_analytics', 'outputs', 'example']
@@ -22,15 +22,13 @@ def add_steps(steps: list, pipeline_id: str,
     if 'npm' in config:
         for package in config['npm']['packages']:
             steps.append(('measure.add_npm_resource', {
-                'package': slugify(package),
-                'project_id': project_id
+                'package': slugify(package)
             }))
 
     if 'pypi' in config:
         for package in config['pypi']['packages']:
             steps.append(('measure.add_pypi_resource', {
-                'package': slugify(package),
-                'project_id': project_id
+                'package': slugify(package)
             }))
 
     steps.append(('measure.remove_resource', {
109 changes: 109 additions & 0 deletions datapackage_pipelines_measure/pipeline_steps/outputs.py
@@ -0,0 +1,109 @@
import os

from datapackage_pipelines_measure.config import settings

import logging
log = logging.getLogger(__name__)

DOWNLOADS_PATH = os.path.join(os.path.dirname(__file__), '../../downloads')

label = 'outputs'


def add_steps(steps: list, pipeline_id: str,
              project_id: str, config: dict) -> list:

    steps.append(('measure.datastore_get_latest', {
        'resource-name': 'latest-project-entries',
        'table': 'outputs',
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'source', 'source_id'],
        'sort_date_key': 'source_timestamp'
    }))

    for source in config:
        steps.append(('measure.add_outputs_resource', {
            'sheet_id': source.get('sheetid'),
            'gid': source.get('gid'),
            'source_type': source.get('type')
        }))

    steps.append(('measure.remove_resource', {
        'name': 'latest-project-entries'
    }))

    steps.append(('concatenate', {
        'target': {
            'name': 'outputs',
            'path': 'data/outputs.csv'},
        'fields': {
            'source_id': [],
            'source_type': [],
            'source': [],
            'source_timestamp': [],
            'source_email': [],
            'output_title': [],
            'output_type': [],
            'output_organization': [],
            'output_person': [],
            'output_link': [],
            'output_date': []}
    }))

    steps.append(('set_types', {
        'types': {
            'source_id': {
                'type': 'string'
            },
            'source_type': {
                'type': 'string'
            },
            'source': {
                'type': 'string'
            },
            'source_timestamp': {
                'type': 'datetime'
            },
            'source_email': {
                'type': 'string'
            },
            'output_title': {
                'type': 'string'
            },
            'output_organization': {
                'type': 'string'
            },
            'output_person': {
                'type': 'string'
            },
            'output_link': {
                'type': 'string'
            },
            'output_date': {
                'type': 'date'
            }}
    }))

    steps.append(('measure.add_project_name', {'name': project_id}))
    steps.append(('measure.add_timestamp'))
    steps.append(('measure.add_uuid'))

    # Dump to path if in development mode
    if settings.get('DEVELOPMENT', False):
        steps.append(('dump.to_path', {
            'out-path': '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        }))

    steps.append(('dump.to_sql', {
        'engine': settings.get('DB_ENGINE'),
        'tables': {
            'outputs': {
                'resource-name': 'outputs',
                'mode': 'update',
                'update_keys': ['project_id', 'source', 'source_timestamp',
                                'source_id']
            }
        }
    }))

    return steps
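
As an illustration of how this module is consumed, below is a hypothetical direct call to `add_steps` using the 'outputs' config from the README example; the pipeline id, project id, and sheet values are invented for the example (in practice the pipeline generator supplies them).

```python
# Hypothetical, stand-alone invocation of outputs.add_steps; ids and sheet
# values are placeholders, not a real Measure project.
from datapackage_pipelines_measure.pipeline_steps import outputs

steps = outputs.add_steps(
    steps=[],
    pipeline_id='example-project-outputs',
    project_id='example-project',
    config=[
        {'sheetid': '{sheetid from above}',
         'gid': '{gid from above}',
         'type': 'external'},
    ],
)

# Print the processor names in execution order; some steps are plain strings,
# others are (name, parameters) tuples.
print([s[0] if isinstance(s, tuple) else s for s in steps])
```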
@@ -136,7 +136,6 @@ def npm_collector(package, latest_date):
 parameters, datapackage, res_iter = ingest()
 
 package = parameters['package']
-project_id = parameters['project_id']
 resource = {
     'name': slugify(package),
     'path': 'data/{}.csv'.format(slugify(package))
138 changes: 138 additions & 0 deletions datapackage_pipelines_measure/processors/add_outputs_resource.py
@@ -0,0 +1,138 @@
import re
import json
import dateutil
import urllib

from datapackage_pipelines.generators import slugify
from datapackage_pipelines.wrapper import ingest, spew

from datapackage_pipelines_measure.processors import google_utils

import logging
log = logging.getLogger(__name__)

TIMESTAMP_COL = 'A'
FAR_PAST_START_DATE = '1990-01-01'


def _request_data_from_google_spreadsheet(start_date):
    '''
    Build a google charts query and append it to an authorised spreadsheets
    request, returning the response.
    '''
    def _build_charts_query(start_date):
        '''
        Build and return a charts query to fetch the most recent rows from the
        spreadsheet, based on the most recent data collected for this source.
        '''
        query = '''
            SELECT *
            WHERE {timestamp} > date '{start_date}'
            ORDER BY {timestamp}
        '''.format(timestamp=TIMESTAMP_COL, start_date=start_date)
        query = query.strip()
        return urllib.parse.quote(query)

    def _parse_response_to_dict(response):
        '''Parse the response from google api and return the bit we want as a
        native dict'''
        regexp = re.compile(b"(^\/\*O_o\*\/\\ngoogle\.visualization\.Query\.setResponse\(|\);$)")  # noqa
        return json.loads(re.sub(regexp, b'', response[1]).decode())

    base_request_url = 'https://docs.google.com/spreadsheets/d/{}/gviz/tq?gid={}&headers=1&tq={}'  # noqa
    request_url = base_request_url.format(sheet_id, gid,
                                          _build_charts_query(start_date))

    authed_http = google_utils.get_authorized_http_object(
        google_utils.GOOGLE_API_DRIVE_SCOPES)

    raw_response = authed_http.request(request_url)

    response = _parse_response_to_dict(raw_response)
    if response.get('status') == 'error':
        raise ValueError('The following error was returned:\n{}'
                         .format(response['errors'][0].get('detailed_message')))  # noqa
    return response


def form_collector(source_id, source_type, latest_date):
    start_date = FAR_PAST_START_DATE
    if latest_date:
        start_date = latest_date.date()

    response = _request_data_from_google_spreadsheet(start_date)

    resource_content = []
    headers = response['table']['cols']
    headers = [slugify(h['label'].lower()) for h in headers]
    for r in response['table']['rows']:
        row = r['c']
        row_dict = {}
        for i, v in enumerate(row):
            if v is not None:
                row_dict[headers[i]] = v.get('f') or v.get('v')
            else:
                row_dict[headers[i]] = None
        output_date = dateutil.parser.parse(row_dict.get('date')).date() \
            if row_dict.get('date') is not None else None
        res_row = {
            'source_id': source_id,
            'source_type': source_type,
            'source': 'gsheets',
            'source_timestamp':
                dateutil.parser.parse(row_dict.get('timestamp')),
            'source_email': row_dict.get('email-address'),
            'output_title': row_dict.get('title'),
            'output_type': row_dict.get('type-of-output'),
            'output_organization': row_dict.get('for-what-organisation'),
            'output_person': row_dict.get('who-did-this'),
            'output_link': row_dict.get('link-if-published'),
            'output_date': output_date
        }
        resource_content.append(res_row)

    return resource_content


def process_resources(res_iter, datapackage, source_id, source_type):

    def get_latest_date(first):
        latest_date = None
        my_rows = []
        for row in first:
            if row['source_id'] == source_id and row['source'] == 'gsheets':
                latest_date = row['source_timestamp']
            my_rows.append(row)
        return latest_date, iter(my_rows)

    if len(datapackage['resources']):
        if datapackage['resources'][0]['name'] == 'latest-project-entries':
            latest_date, latest_iter = get_latest_date(next(res_iter))
            yield latest_iter
        else:
            latest_date = None
            yield from res_iter
    yield form_collector(source_id, source_type, latest_date)


parameters, datapackage, res_iter = ingest()

sheet_id = parameters['sheet_id']
gid = parameters['gid']
source_type = parameters['source_type']
source_id = '{0}/{1}'.format(sheet_id, gid)
resource = {
    'name': slugify(sheet_id).lower(),
    'path': 'data/{}.csv'.format(slugify(sheet_id))
}

headers = ['source', 'source_type', 'source_timestamp', 'source_email',
           'output_title', 'output_type', 'output_organization',
           'output_person', 'output_link', 'output_date']
resource['schema'] = {'fields': [{'name': h, 'type': 'string'}
                                 for h in headers]}

datapackage['resources'].append(resource)

spew(datapackage, process_resources(res_iter, datapackage,
                                    source_id, source_type))
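
To make the response handling above concrete, here is a small self-contained illustration of the unwrapping that `_parse_response_to_dict` performs; the byte payload is fabricated for the example and is not a real gviz response.

```python
import json
import re

# Fabricated gviz-style body; real responses wrap a JSON object carrying the
# spreadsheet 'table' with its 'cols' and 'rows'.
raw_body = (b'/*O_o*/\n'
            b'google.visualization.Query.setResponse('
            b'{"status": "ok", "table": {"cols": [{"label": "Timestamp"}], '
            b'"rows": [{"c": [{"v": "Date(2017,5,23)", "f": "23/06/2017"}]}]}}'
            b');')

# Strip the setResponse(...) wrapper, as the processor does, then parse.
regexp = re.compile(
    rb"(^/\*O_o\*/\ngoogle\.visualization\.Query\.setResponse\(|\);$)")
parsed = json.loads(re.sub(regexp, b'', raw_body).decode())
print(parsed['table']['rows'][0]['c'][0]['f'])  # -> 23/06/2017
```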
@@ -116,7 +116,6 @@ def pypi_collector(package, latest_date):
 parameters, datapackage, res_iter = ingest()
 
 package = parameters['package']
-project_id = parameters['project_id']
 resource = {
     'name': slugify(package),
     'path': 'data/{}.csv'.format(slugify(package))
@@ -16,6 +16,7 @@
 engine = parameters['engine']
 resource_name = parameters['resource-name']
 distinct_on = ', '.join(parameters['distinct_on'])
+sort_date_key = parameters.get('sort_date_key', 'date')
 
 Base = automap_base()
 engine = create_engine(engine)
@@ -33,7 +34,8 @@
 s = text(
     "SELECT DISTINCT ON ({0}) *"
     "FROM {1} "
-    "ORDER BY {0}, date DESC".format(distinct_on, table)
+    "ORDER BY {0}, {2} DESC"
+    .format(distinct_on, table, sort_date_key)
 )
 
 results = session.query(Table).from_statement(s).all()
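
With the new `sort_date_key` parameter, the outputs pipeline can deduplicate on `source_timestamp` instead of the default `date` column. Roughly, the statement rendered for the outputs table would look like the following (an illustration mirroring the `.format` call above, not the verbatim string):

```python
# Illustration of the SQL rendered for the outputs table, using the
# parameters set by pipeline_steps/outputs.py.
distinct_on = ', '.join(['project_id', 'source', 'source_id'])
table = 'outputs'
sort_date_key = 'source_timestamp'

s = ("SELECT DISTINCT ON ({0}) * "
     "FROM {1} "
     "ORDER BY {0}, {2} DESC").format(distinct_on, table, sort_date_key)
print(s)
# SELECT DISTINCT ON (project_id, source, source_id) * FROM outputs
# ORDER BY project_id, source, source_id, source_timestamp DESC
```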
