Merge branch '39/packagist'

okfn · Jul 7, 2017 · 41df667 · 41df667
2 parents 4e5ccd9 + 11e09e0
commit 41df667
Show file tree

Hide file tree

Showing 7 changed files with 378 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -39,6 +39,7 @@ Potentially, we'd love to see interest from other non-profits who receive funds
     - [NPM](#npm)
     - [PyPI](#pypi)
     - [RubyGems](#rubygems)
+    - [Packagist](#packagist)
   - [Social Media](#social-media)
     - [Twitter](#twitter)
     - [Facebook](#facebook)
@@ -161,7 +162,7 @@ The PyPI processor requires a Google API account with generated credential to ma
 
 #### RubyGems
 
-The RubyGems processor collects gem package download data from the rubygems.org API.
+The RubyGems processor collects ruby gem download data from the rubygems.org API.
 
 - **total_downloads**: collected directly from the API
 - **downloads**: daily downloads calculated from yesterday's `total_downloads` value, if present.
@@ -177,6 +178,26 @@ config:
 
 No historical download data is collected for RubyGems.
 
+#### Packagist
+
+The Packagist processor collects PHP package daily download data from the packagist.org API.
+
+- **downloads**: daily downloads collected directly from the API.
+
+```yaml
+config:
+  code-packaging:
+    packagist:
+      packages:
+        - "frictionlessdata/tableschema"
+        - "frictionlessdata/datapackage"
+```
+
+Note: `packages` defined in the config must include their owner organization in the form `organization_name/package_name`.
+
+Download figures returned by the Packagist.org API appear to lag a couple of days behind the current date.
+
+
 ### Social Media
 
 #### Twitter

diff --git a/datapackage_pipelines_measure/pipeline_steps/code_packaging.py b/datapackage_pipelines_measure/pipeline_steps/code_packaging.py
@@ -37,6 +37,12 @@ def add_steps(steps: list, pipeline_id: str,
                 'gem_id': gem
             }))
 
+    if 'packagist' in config:
+        for package in config['packagist']['packages']:
+            steps.append(('measure.add_packagist_resource', {
+                'package': package
+            }))
+
     steps.append(('measure.remove_resource', {
         'name': 'latest-project-entries'
     }))

diff --git a/datapackage_pipelines_measure/processors/add_packagist_resource.py b/datapackage_pipelines_measure/processors/add_packagist_resource.py
@@ -0,0 +1,108 @@
+import dateutil
+from collections import OrderedDict
+
+import simplejson
+import requests
+
+from datapackage_pipelines.generators import slugify
+from datapackage_pipelines.wrapper import ingest, spew
+
+import logging
+log = logging.getLogger(__name__)
+
+
+def _request_data_from_packagist(endpoint):
+    '''Request data and handle errors from packagist.org REST API.'''
+
+    packagist_url = 'https://packagist.org{endpoint}' \
+        .format(endpoint=endpoint)
+
+    packagist_response = requests.get(packagist_url)
+
+    if (packagist_response.status_code != 200):
+        log.error('An error occurred fetching Packagist data: {}'
+                  .format(packagist_response.text))
+        raise Exception(packagist_response.text)
+
+    try:
+        json_response = packagist_response.json()
+    except simplejson.scanner.JSONDecodeError as e:
+        log.error('Expected JSON in response from: {}'.format(packagist_url))
+        raise e
+
+    return json_response
+
+
+def _request_package_stats_from_packagist(package):
+    '''Request general info for a package.'''
+    endpoint = '/packages/{package}/stats/all.json'.format(package=package)
+    json_response = _request_data_from_packagist(endpoint)
+    return json_response
+
+
+def packagist_collector(package, latest_row):
+    package_info = _request_package_stats_from_packagist(package)
+    download_by_date = dict(zip(package_info['labels'],
+                                package_info['values']))
+
+    if latest_row:
+        # If there's a latest_row, reject all items in download_by_date before
+        # latest_row date
+        latest_row_date_str = latest_row['date'].strftime('%Y-%m-%d')
+        download_by_date = {k: v for k, v in download_by_date.items()
+                            if k >= latest_row_date_str}
+
+    # ensure dict is ordered by date key
+    download_by_date = OrderedDict(sorted(download_by_date.items()))
+
+    resource_content = []
+    for k, v in download_by_date.items():
+        res_row = {
+            'package': package.split('/')[-1],
+            'source': 'packagist',
+            'date': dateutil.parser.parse(k).date(),
+            'downloads': v
+        }
+        resource_content.append(res_row)
+
+    return resource_content
+
+
+parameters, datapackage, res_iter = ingest()
+
+package = parameters['package']
+resource = {
+    'name': slugify(package),
+    'path': 'data/{}.csv'.format(slugify(package))
+}
+
+headers = ['package', 'source', 'date', 'downloads']
+resource['schema'] = {'fields': [{'name': h, 'type': 'string'}
+                                 for h in headers]}
+
+datapackage['resources'].append(resource)
+
+
+def process_resources(res_iter, datapackage, package):
+
+    def get_latest_row(first):
+        latest_row = None
+        package_name = package.split('/')[-1]
+        my_rows = []
+        for row in first:
+            if row['package'] == package_name and row['source'] == 'packagist':
+                latest_row = row
+            my_rows.append(row)
+        return latest_row, iter(my_rows)
+
+    if len(datapackage['resources']):
+        if datapackage['resources'][0]['name'] == 'latest-project-entries':
+            latest_row, latest_iter = get_latest_row(next(res_iter))
+            yield latest_iter
+        else:
+            latest_row = None
+    yield from res_iter
+    yield packagist_collector(package, latest_row)
+
+
+spew(datapackage, process_resources(res_iter, datapackage, package))
diff --git a/datapackage_pipelines_measure/schemas/measure_spec_schema.json b/datapackage_pipelines_measure/schemas/measure_spec_schema.json
@@ -72,6 +72,15 @@
                 }
               },
               "required": ["gems"]
+            },
+            "packagist": {
+              "type": "object",
+              "properties": {
+                "packages": {
+                  "type": "array"
+                }
+              },
+              "required": ["packages"]
             }
           }
         },

diff --git a/projects/frictionlessdata/measure.source-spec.yaml b/projects/frictionlessdata/measure.source-spec.yaml
@@ -32,6 +32,10 @@ config:
       gems:
         - 'tableschema'
         - 'datapackage'
+    packagist:
+      packages:
+        - 'frictionlessdata/tableschema'
+        - 'frictionlessdata/datapackage'
 
   code-hosting:
     github:

diff --git a/setup.py b/setup.py
@@ -18,7 +18,7 @@ def read(*paths):
 PACKAGE = 'datapackage_pipelines_measure'
 NAME = PACKAGE.replace('_', '-')
 INSTALL_REQUIRES = [
-    'datapackage-pipelines',
+    'datapackage-pipelines==1.0.19',
     'psycopg2',
     'tweepy',
     'facebook-sdk',