Skip to content
This repository has been archived by the owner on Dec 2, 2021. It is now read-only.

Commit

Permalink
Merge e3474c4 into 251a6e4
Browse files Browse the repository at this point in the history
  • Loading branch information
brew committed May 9, 2017
2 parents 251a6e4 + e3474c4 commit 220cb65
Show file tree
Hide file tree
Showing 12 changed files with 638 additions and 26 deletions.
30 changes: 28 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,27 @@ config:
```


#### Twitter

The Twitter processor collects data about each entity listed in the `entities` section. Entities can be either a Twitter hashtag (starting with `#`) or an account name (starting with `@`). For each entity, the processor collects:

- **mentions**: an accumulated total of tweets mentioning the entity
- **interactions**: an accumulated total of 'favorites' and 'retweets' for tweets mentioning the hashtag, or tweets authored by the account.

And additionally, for account entities:
- the current number of **followers**

```yaml
config:
social-media:
twitter:
entities:
- "#frictionlessdata"
- "#datapackages"
- "@okfnlabs"
```


## Installation

### Environmental Variables
Expand All @@ -98,5 +119,10 @@ Each installation of Measure requires certain environmental variables to be set.

#### Github

- `MEASURE_GITHUB_API_BASE_URL`: Github api base url (`https://api.github.com/repos/`)
- `MEASURE_GITHUB_API_TOKEN`: Github api token used for making requests
- `MEASURE_GITHUB_API_BASE_URL`: Github API base url (`https://api.github.com/repos/`)
- `MEASURE_GITHUB_API_TOKEN`: Github API token used for making requests

#### Twitter

- `MEASURE_TWITTER_API_CONSUMER_KEY`: Twitter app API consumer key
- `MEASURE_TWITTER_API_CONSUMER_SECRET`: Twitter app API consumer secret
6 changes: 6 additions & 0 deletions datapackage_pipelines_measure/datastore/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from .sql import SQLDatastore


def get_datastore():
    '''Return a new datastore instance backed by the configured SQL database.'''
    return SQLDatastore()
42 changes: 42 additions & 0 deletions datapackage_pipelines_measure/datastore/sql.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, desc

from datapackage_pipelines_measure.config import settings

import logging
log = logging.getLogger(__name__)


class SQLDatastore():
    '''Datastore backed by the SQL database configured in settings.

    Tables are reflected from the live database via SQLAlchemy automap,
    so no local model definitions are required.
    '''

    def __init__(self):
        self.Base = automap_base()
        self.engine = create_engine(settings.DB_ENGINE)
        # Reflect the existing tables into mapped classes
        self.Base.prepare(self.engine, reflect=True)

    def get_latest_from_table(self, filter, table):
        '''
        Get the most recent row (by `timestamp` column) from `table`
        matching the passed `filter`.

        :param filter: dict of column name -> value, applied via filter_by.
            (Name shadows the `filter` builtin; kept for caller
            compatibility.)
        :param table: name of the reflected table to query
        :return: the row's columns as a dict, or None if the table does not
            exist or no row matches.
        '''
        try:
            Table = self.Base.classes[table]
        except KeyError:
            # No table in database
            return None

        session = Session(self.engine)
        try:
            row = session.query(Table) \
                .order_by(desc(Table.timestamp)) \
                .filter_by(**filter) \
                .first()
        finally:
            # Always release the connection back to the pool; the original
            # leaked the session on every call.
            session.close()

        if row is None:
            return None

        # Return row's columns as dict
        return dict((col, getattr(row, col))
                    for col in row.__table__.columns.keys())
3 changes: 2 additions & 1 deletion datapackage_pipelines_measure/pipeline_steps/code_hosting.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ def add_steps(steps: list, pipeline_id: str,
'fields': {
'repository': [],
'watchers': [],
'stars': []}
'stars': [],
'source': []}
}))

steps.append(('set_types', {
Expand Down
87 changes: 72 additions & 15 deletions datapackage_pipelines_measure/pipeline_steps/social_media.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,80 @@
import os

ROOT_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
from datapackage_pipelines_measure.config import settings


DOWNLOADS_PATH = os.path.join(os.path.dirname(__file__), '../../downloads')

label = 'social-media'


def add_steps(steps: list, pipeline_id: str,
              project_id: str, config: dict) -> list:
    '''Append the social-media pipeline steps to `steps` and return it.

    For each Twitter entity in the config, adds a resource-collection step,
    then concatenates the resources, types the fields, stamps the rows with
    project name / timestamp / uuid, and dumps to path and SQL.

    NOTE: the scraped diff interleaved the pre-merge body (an immediate
    `return steps + [...]`) with the merged one; only the merged body is
    kept here, as anything after an unconditional return is unreachable.
    '''
    for entity in config['twitter']['entities']:
        steps.append(('measure.add_twitter_resource', {
            'entity': entity,
            'project_id': project_id
        }))

    steps.append(('concatenate', {
        'target': {
            'name': 'social-media',
            'path': 'data/social-media.json'},
        'fields': {
            'entity': [],
            'entity_type': [],
            'source': [],
            'date': [],
            'followers': [],
            'mentions': [],
            'interactions': []}
    }))

    steps.append(('set_types', {
        'types': {
            'entity': {
                'type': 'string',
            },
            'entity_type': {
                'type': 'string'
            },
            'source': {
                'type': 'string'
            },
            'date': {
                'type': 'date',
            },
            'followers': {
                'type': 'integer'
            },
            'mentions': {
                'type': 'integer'
            },
            'interactions': {
                'type': 'integer'
            }
        }
    }))

    steps.append(('measure.add_project_name', {'name': project_id}))
    # NOTE(review): the original appended `('measure.add_timestamp')`, which
    # is just a string (parens without a comma don't make a tuple). Every
    # other step is a (run, params) tuple, so these are made consistent —
    # confirm the generator's step helper indeed expects tuples.
    steps.append(('measure.add_timestamp', {}))
    steps.append(('measure.add_uuid', {}))

    # temporarily dump to path for development
    steps.append(('dump.to_path', {
        'out-path': '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
    }))

    steps.append(('dump.to_sql', {
        'engine': settings.DB_ENGINE,
        'tables': {
            'socialmedia': {
                'resource-name': 'social-media',
                'mode': 'update',
                'update_keys': ['entity', 'entity_type',
                                'source', 'project_id', 'date']
            }
        }
    }))

    return steps
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
# remap retrieved dict to scheme in parameters
resource_content = {t_key: repo_content[s_key]
for t_key, s_key in parameters['map_fields'].items()}
resource_content['source'] = 'github'

resource = {
'name': name,
Expand Down
6 changes: 6 additions & 0 deletions datapackage_pipelines_measure/processors/add_timestamp.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
'''This processor adds a timestamp for the datetime of collection.
Note: the collection date may not reflect the date the data collected pertains
to, especially for historic data.
'''

import datetime

from datapackage_pipelines.wrapper import process
Expand Down

0 comments on commit 220cb65

Please sign in to comment.