Skip to content
This repository has been archived by the owner on Dec 2, 2021. It is now read-only.

Commit

Permalink
Merge branch '6/mailchimp'
Browse files Browse the repository at this point in the history
  • Loading branch information
brew committed Jul 5, 2017
2 parents 2a25c0d + f8e35ed commit 9dddf9d
Show file tree
Hide file tree
Showing 10 changed files with 671 additions and 2 deletions.
57 changes: 57 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,33 @@ Potentially, we'd love to see interest from other non-profits who receive funds

## Project Configuration

<!-- MarkdownTOC autolink="true" bracket="round" depth=3 -->

- [Code Hosting](#code-hosting)
- [Github](#github)
- [Code Packaging](#code-packaging)
- [NPM](#npm)
- [PyPI](#pypi)
- [Social Media](#social-media)
- [Twitter](#twitter)
- [Facebook](#facebook)
- [Website Analytics](#website-analytics)
- [Google Analytics](#google-analytics)
- [Outputs](#outputs)
- [Outputs Captured by Google Forms](#outputs-captured-by-google-forms)
- [Email Campaigns](#email-campaigns)
- [MailChimp](#mailchimp)
- [Environmental Variables](#environmental-variables)
- [General](#general)
- [Github](#github-1)
- [Twitter](#twitter-1)
- [Facebook](#facebook-1)
- [Google credentials for PyPI, Google analytics, and Outputs](#google-credentials-for-pypi-google-analytics-and-outputs)
- [MailChimp](#mailchimp-1)

<!-- /MarkdownTOC -->


Each project has a `measure.source-spec.yaml` configuration file within a project directory in `/projects`, e.g. for the Frictionless Data project:

```
Expand Down Expand Up @@ -302,6 +329,33 @@ config:
type: "internal"
```


### Email Campaigns

#### MailChimp

The MailChimp processor collects email list data each day. For each list the following is collected:

- **subscribers**: The current total number of subscribers to the list.
- **subs**: The number of subscribers added that day. Counts both opt-ins and other additions made by admins.
- **unsubs**: The number of subscribers removed that day. Counts both unsubscribes and other removals made by admins.
- **campaigns_sent**: The number of campaigns sent that day.

The processor will attempt to collect historic data back to the creation date of the list. Complete data is collected for `subs`, `unsubs`, and `campaigns_sent`. Only partial historic data is collected for `subscribers`: when collecting historic data, it is recorded once per month, for the last day of that month.

List ids are added to the project config file:

```yaml
config:
  email:
    mailchimp:
      lists:
        - 'my-mailchimp-list-id'
        - 'another-mailchimp-list-id'
```

A MailChimp API key must be defined as an environmental variable. See below for details.

## Environmental Variables

Each installation of Measure requires certain environmental variables to be set.
Expand Down Expand Up @@ -337,3 +391,6 @@ See the [PyPI Big Query API](#pypi-configuration) instructions above to get the
- `MEASURE_GOOGLE_API_JWT_TOKEN_URI`: {token_uri}
- `MEASURE_GOOGLE_API_JWT_TYPE`: {type}

### MailChimp

- `MEASURE_MAILCHIMP_API_TOKEN`: {mailchimp_api_key} (note: must include the data center code, e.g. `123abc456def-dc1`, where `dc1` is the data center code).
3 changes: 2 additions & 1 deletion datapackage_pipelines_measure/pipeline_steps/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
code_packaging,
website_analytics,
outputs,
email,
example
)

__all__ = ['social_media', 'code_hosting', 'code_packaging',
'website_analytics', 'outputs', 'example']
'website_analytics', 'outputs', 'email', 'example']
95 changes: 95 additions & 0 deletions datapackage_pipelines_measure/pipeline_steps/email.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import os

from datapackage_pipelines_measure.config import settings

import logging
log = logging.getLogger(__name__)

# Directory used as the base of the `dump.to_path` output in development
# mode; built relative to this file (`../../downloads`).
DOWNLOADS_PATH = os.path.join(os.path.dirname(__file__), '../../downloads')

# NOTE(review): presumably the config key this step module is registered
# under (it matches the `email` section of the project config) - confirm
# against the pipeline generator.
label = 'email'


def add_steps(steps: list, pipeline_id: str,
              project_id: str, config: dict) -> list:
    '''Append the email pipeline steps to `steps` and return the same list.

    :steps: list of pipeline steps; it is extended in place
    :pipeline_id: names the dump directory used in development mode
    :project_id: handed to the `measure.add_project_name` step
    :config: presumably the `email` section of the project config; when it
        has a `mailchimp` key, that entry must provide a `lists` sequence
    '''
    # Column name -> type, in the order the columns are declared to the
    # `concatenate` and `set_types` steps.
    column_types = (
        ('source', 'string'),
        ('list_id', 'string'),
        ('date', 'date'),
        ('subscribers', 'integer'),
        ('subs', 'integer'),
        ('unsubs', 'integer'),
        ('campaigns_sent', 'integer'),
    )

    # Load the most recent stored row per project/source/list so that the
    # collectors can read it.
    steps.append(('measure.datastore_get_latest', {
        'resource-name': 'latest-project-entries',
        'table': 'email',
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'source', 'list_id']
    }))

    # One collector step per configured MailChimp list.
    mailchimp_lists = \
        config['mailchimp']['lists'] if 'mailchimp' in config else []
    steps.extend(
        ('measure.add_mailchimp_resource', {'list_id': mailchimp_list})
        for mailchimp_list in mailchimp_lists
    )

    steps.extend([
        ('measure.remove_resource', {
            'name': 'latest-project-entries'
        }),
        ('concatenate', {
            'target': {
                'name': 'email',
                'path': 'data/email.csv'},
            'fields': {column: [] for column, _ in column_types}
        }),
        ('set_types', {
            'types': {
                column: {'type': column_type}
                for column, column_type in column_types
            }
        }),
        ('measure.add_project_name', {'name': project_id}),
        # NOTE(review): the next two entries are plain strings, not
        # 1-tuples, exactly as before - confirm the consumer of `steps`
        # accepts both shapes.
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # Dump to path if in development mode
    if settings.get('DEVELOPMENT', False):
        steps.append(('dump.to_path', {
            'out-path': '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        }))

    steps.append(('dump.to_sql', {
        'engine': settings.get('DB_ENGINE'),
        'tables': {
            'email': {
                'resource-name': 'email',
                'mode': 'update',
                'update_keys': ['date', 'source', 'list_id', 'project_id']
            }
        }
    }))

    return steps
190 changes: 190 additions & 0 deletions datapackage_pipelines_measure/processors/add_mailchimp_resource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
import calendar
import collections
import datetime

import dateutil
import dateutil.parser
import requests
import simplejson
from requests.auth import HTTPBasicAuth

from datapackage_pipelines.generators import slugify
from datapackage_pipelines.wrapper import ingest, spew

from datapackage_pipelines_measure.config import settings

import logging
log = logging.getLogger(__name__)


def _request_data_from_mailchimp(endpoint):
    '''Request data and handle errors from MailChimp REST API.

    :endpoint: path (plus any query string) appended to the `/3.0` API
        root, e.g. `/lists/{list_id}`

    Returns the decoded JSON body of the response.

    Raises `Exception` carrying the error detail when the status code is
    not 200, and re-raises the decoding error when a 200 response does not
    contain JSON.
    '''
    api_token = settings['MAILCHIMP_API_TOKEN']
    # The data center code is whatever follows the last hyphen of the token.
    data_center = api_token.split('-')[-1]
    mailchimp_url = 'https://{dc}.api.mailchimp.com/3.0{endpoint}' \
        .format(dc=data_center, endpoint=endpoint)

    # The token is sent as the basic-auth password; 'username' is the
    # literal placeholder user name.
    mailchimp_response = requests.get(mailchimp_url,
                                      auth=HTTPBasicAuth('username',
                                                         api_token))

    if mailchimp_response.status_code != 200:
        # An error body is not guaranteed to be JSON or to carry `detail`
        # (e.g. a proxy error page), so fall back to the status code rather
        # than failing with an unrelated decoding/lookup error.
        try:
            detail = mailchimp_response.json()['detail']
        except (ValueError, KeyError, TypeError):
            detail = 'HTTP {} returned by {}'.format(
                mailchimp_response.status_code, mailchimp_url)
        log.error('An error occurred fetching MailChimp data: {}'
                  .format(detail))
        raise Exception(detail)

    try:
        return mailchimp_response.json()
    except ValueError:
        # Both simplejson's and the stdlib's JSONDecodeError subclass
        # ValueError, so this works whichever one requests uses.
        log.error('Expected JSON in response from: {}'.format(mailchimp_url))
        raise


def _request_general_stats_from_mailchimp(list_id):
    '''Fetch the `/lists/{list_id}` record for `list_id` from MailChimp and
    return the decoded JSON response.'''
    return _request_data_from_mailchimp('/lists/{}'.format(list_id))


def _request_activity_stats_from_mailchimp(list_id, count):
    '''Fetch the activity records of `list_id` from MailChimp, passing
    `count` as the `count` query parameter, and return the decoded JSON
    response.'''
    activity_endpoint = '/lists/{}/activity?count={}'.format(list_id, count)
    return _request_data_from_mailchimp(activity_endpoint)


def _request_campaign_stats_from_mailchimp(list_id, since, count=1000):
    '''Request campaign stats for the list_id from MailChimp, where the
    send_time is after `since` (inclusive).

    :list_id: id of the MailChimp list the campaigns were sent to
    :since: lower bound, sent as the `since_send_time` query parameter
    :count: maximum number of campaigns to return. It is passed explicitly
        because MailChimp otherwise returns only its default page of 10
        records, which silently undercounts campaigns when a long period
        is collected.

    Returns the decoded JSON response.
    '''
    endpoint = ('/campaigns/?list_id={list_id}&since_send_time={since}'
                '&count={count}') \
        .format(list_id=list_id, since=since, count=count)
    return _request_data_from_mailchimp(endpoint)


def _request_growth_history_from_mailchimp(list_id, year_month):
    '''Fetch the growth-history record of `list_id` for a given 'yyyy-mm'
    from MailChimp and return the decoded JSON response.'''
    growth_endpoint = '/lists/{}/growth-history/{}'.format(list_id,
                                                           year_month)
    return _request_data_from_mailchimp(growth_endpoint)


def _get_start_date(default_start, latest_date=None):
'''Determine when data collection should start.
:latest_date: the most recent date data was collected for this list_id, if
it exists
'''
if latest_date:
return max(latest_date, default_start)
else:
return default_start


def _get_campaigns_number_by_date(list_id, start_date):
    '''Return a Counter where the key is a date, and value is the number of
    campaigns sent on that date.

    :list_id: id of the MailChimp list
    :start_date: only campaigns sent since this date are requested
    '''
    campaigns = _request_campaign_stats_from_mailchimp(list_id, start_date)
    # Skip entries without a send time: there is nothing to count them
    # under, and parsing an empty string either raises or (in older
    # dateutil releases) silently yields today's date.
    return collections.Counter(
        dateutil.parser.parse(campaign['send_time']).date()
        for campaign in campaigns['campaigns']
        if campaign.get('send_time')
    )


def mailchimp_collector(list_id, latest_row):
    '''Collect the daily MailChimp rows for `list_id`.

    :list_id: id of the MailChimp list
    :latest_row: the most recent row already stored for this list (its
        `date` and `subscribers` values are read), or a falsy value when
        there is none

    Returns a list of dicts, one per activity record returned by MailChimp,
    with the keys `source`, `list_id`, `date`, `subs`, `unsubs` and
    `campaigns_sent`. `subscribers` is only present for today's row, for
    the row matching `latest_row`, and for rows dated the last day of a
    month.
    '''
    # Resolve "today" once so that the day count and the per-row
    # comparisons below agree even if the run crosses midnight.
    today = datetime.date.today()

    general_stats = _request_general_stats_from_mailchimp(list_id)
    list_created = dateutil.parser.parse(general_stats['date_created']).date()

    latest_date = latest_row['date'] if latest_row else None
    start_date = _get_start_date(list_created, latest_date)
    # Count the number of days from the start_date to today. Add an extra day
    # to include the previous entry, which already exists in the db. We want
    # to update its `subs` and `unsubs` but retain its `subscribers` value.
    day_count = (today - start_date).days + 1

    activity_stats = _request_activity_stats_from_mailchimp(list_id,
                                                            count=day_count)

    # Get campaign stats as a Counter({date obj: integer})
    campaigns_dates = _get_campaigns_number_by_date(list_id, start_date)

    resource_content = []
    for activity in activity_stats['activity']:
        activity_date = dateutil.parser.parse(activity['day']).date()
        res_row = {
            'source': 'mailchimp',
            'list_id': list_id,
            'date': activity_date,
            'subs': activity['subs'] + activity['other_adds'],
            'unsubs': activity['unsubs'] + activity['other_removes']
        }
        # If date of activity is today, add the subscribers data from
        # general stats.
        if activity_date == today:
            res_row['subscribers'] = general_stats['stats']['member_count']
        # If date of activity is also the latest existing row, add its
        # subscribers value to the new row, retaining it when updated to db.
        # This deliberately wins over today's member count set above.
        if activity_date == latest_date:
            res_row['subscribers'] = latest_row['subscribers']
        # Add number of campaigns sent from `campaigns_dates`.
        res_row['campaigns_sent'] = campaigns_dates.get(activity_date, 0)
        # We can collect historical `subscribers` data from MailChimp for the
        # last day of each month. Let's do that if activity_date is the last
        # in month, and we haven't already populated the value above.
        last_day_of_month = calendar.monthrange(activity_date.year,
                                                activity_date.month)[1]
        if activity_date.day == last_day_of_month \
                and 'subscribers' not in res_row:
            growth = _request_growth_history_from_mailchimp(
                list_id,
                '{}-{:02d}'.format(activity_date.year, activity_date.month)
            )
            res_row['subscribers'] = growth['existing']

        resource_content.append(res_row)

    return resource_content


parameters, datapackage, res_iter = ingest()

list_id = parameters['list_id']

# Every column is declared as a string here.
headers = ['source', 'date', 'list_id', 'subs', 'unsubs', 'subscribers',
           'campaigns_sent']

# Descriptor of the resource this processor adds for the list; it is
# appended after whatever resources the datapackage already holds.
resource = {
    'name': slugify(list_id),
    'path': 'data/{}.csv'.format(slugify(list_id)),
    'schema': {
        'fields': [{'name': h, 'type': 'string'} for h in headers]
    }
}

datapackage['resources'].append(resource)


def process_resources(res_iter, datapackage, list_id):
    '''Yield every incoming resource, then the newly collected MailChimp
    rows for `list_id`.

    :res_iter: iterator over the incoming resources (each an iterable of
        row dicts)
    :datapackage: datapackage descriptor; only the name of its first
        resource is inspected
    :list_id: id of the MailChimp list to collect

    When the first resource is `latest-project-entries`, it is read in full
    to find the latest stored row for this list, and then re-yielded
    unchanged. That row is handed to `mailchimp_collector`.
    '''

    def get_latest_row(first):
        # Materialise the rows so they can be scanned here and still be
        # passed on downstream afterwards.
        rows = list(first)
        latest = None
        for row in rows:
            if row['list_id'] == list_id and row['source'] == 'mailchimp':
                # Keep the last matching row.
                latest = row
        return latest, iter(rows)

    # Default up front so the collector call below never sees an unbound
    # name, whatever the datapackage contains.
    latest_row = None

    resources = datapackage['resources']
    if resources and resources[0]['name'] == 'latest-project-entries':
        latest_row, latest_iter = get_latest_row(next(res_iter))
        yield latest_iter

    # Pass through all remaining resources (e.g. those added by earlier
    # steps) before appending our own.
    yield from res_iter
    yield mailchimp_collector(list_id, latest_row)


# Pass the updated datapackage and the resource generator to `spew`
# (imported from datapackage_pipelines.wrapper).
spew(datapackage, process_resources(res_iter, datapackage, list_id))
18 changes: 18 additions & 0 deletions datapackage_pipelines_measure/schemas/measure_spec_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,24 @@
"required": ["sheetid", "gid", "type"]
},
"minItems": 1
},
"email": {
  "type": "object",
  "properties": {
    "mailchimp": {
      "type": "object",
      "properties": {
        "lists": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "minItems": 1
        }
      },
      "required": ["lists"]
    }
  }
}
}
}
Expand Down
Loading

0 comments on commit 9dddf9d

Please sign in to comment.