
feat(scripts): dockerize the import/summarize scripts

philbooth committed Dec 12, 2018
1 parent d93c357 commit be848b478faa6c8a54a169414d58271e87d680b6
Showing with 159 additions and 44 deletions.
  1. +54 −0 .circleci/config.yml
  2. +17 −0 Dockerfile
  3. +0 −4 Makefile
  4. +10 −7 calculate_daily_summary.py
  5. +10 −4 calculate_daily_summary_retro.py
  6. +20 −9 import_activity_events_retro.py
  7. +23 −9 import_counts.py
  8. +25 −11 import_events.py
@@ -0,0 +1,54 @@
version: 2

jobs:
  build:
    docker:
      - image: circleci/python:2.7

    working_directory: ~/fxa

    steps:
      - checkout

      - setup_remote_docker

      - run:
          name: Build docker image
          command: docker build -f Dockerfile -t fxa-activity-metrics .

      - run:
          name: Deploy to dockerhub
          shell: /bin/bash
          command: |
            if [ "${CIRCLE_BRANCH}" == "master" ]; then
              DOCKER_TAG="latest"
            fi
            if [[ "${CIRCLE_BRANCH}" == feature* ]] || [[ "${CIRCLE_BRANCH}" == dockerpush* ]]; then
              DOCKER_TAG="${CIRCLE_BRANCH}"
            fi
            if [ -n "${CIRCLE_TAG}" ]; then
              DOCKER_TAG="$CIRCLE_TAG"
            fi
            if [ -n "${DOCKER_TAG}" ]; then
              echo "$DOCKER_PASS" | docker login -u "$DOCKER_USER" --password-stdin
              echo ${DOCKERHUB_REPO}:${DOCKER_TAG}
              docker tag fxa-activity-metrics ${DOCKERHUB_REPO}:${DOCKER_TAG}
              docker images
              docker push ${DOCKERHUB_REPO}:${DOCKER_TAG}
            fi
workflows:
  version: 2

  build-test-push:
    jobs:
      - build:
          filters:
            branches:
              ignore: /.*/

            tags:
              only: /^v[0-9.]+$/
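
Note: as configured, the branch filter ignores everything and the tag filter only matches /^v[0-9.]+$/, so this workflow runs when a version tag is pushed and CIRCLE_TAG becomes the Docker tag; DOCKER_USER, DOCKER_PASS and DOCKERHUB_REPO are expected to come from the CI environment. A minimal way to trigger a push, assuming a tag of that form (v1.0.0 below is only an example value):

    git tag v1.0.0
    git push origin v1.0.0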
@@ -0,0 +1,17 @@
FROM python:2.7-slim

RUN apt-get update
RUN apt-get install -y build-essential python-virtualenv

RUN groupadd --gid 10001 app
RUN useradd --uid 10001 --gid 10001 --home /app --create-home app

WORKDIR /app

COPY . /app

RUN make build

USER app

CMD ["make", "import"]
@@ -3,7 +3,6 @@ VIRTUALENV = virtualenv --python=$(SYSTEMPYTHON)
ENV = ./build
PIP_INSTALL = $(ENV)/bin/pip install


.PHONY: all
all: build

@@ -20,7 +19,4 @@ import: | $(ENV)/COMPLETE
	$(ENV)/bin/python ./import_flow_events.py
	$(ENV)/bin/python ./import_email_events.py
	$(ENV)/bin/python ./import_counts.py

.PHONY: summarize
summarize: | $(ENV)/COMPLETE
	$(ENV)/bin/python ./calculate_daily_summary.py
@@ -10,16 +10,19 @@
import json
import time
import datetime

import os
import postgres

# Load config from disk,
# and pull in credentials from the environment.

with open("config.json") as f:
CONFIG = json.loads(f.read())
REDSHIFT_USER = os.environ["REDSHIFT_USER"]
REDSHIFT_PASSWORD = os.environ["REDSHIFT_PASSWORD"]
REDSHIFT_HOST = os.environ["REDSHIFT_HOST"]
REDSHIFT_PORT = os.environ["REDSHIFT_PORT"]
REDSHIFT_DBNAME = os.environ["REDSHIFT_DBNAME"]

DB = "postgresql://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}".format(**CONFIG)
DB = "postgresql://{REDSHIFT_USER}:{REDSHIFT_PASSWORD}@{REDSHIFT_HOST}:{REDSHIFT_PORT}/{REDSHIFT_DBNAME}".format(
REDSHIFT_USER=REDSHIFT_USER, REDSHIFT_PASSWORD=REDSHIFT_PASSWORD, REDSHIFT_HOST=REDSHIFT_HOST,
REDSHIFT_PORT=REDSHIFT_PORT, REDSHIFT_DBNAME=REDSHIFT_DBNAME
)

TABLE_SUFFIXES = (
"_sampled_10",
@@ -10,16 +10,22 @@
import json
import time
import datetime

import os
import postgres

# Load config from disk,
# and pull in credentials from the environment.

with open("config.json") as f:
CONFIG = json.loads(f.read())
REDSHIFT_USER = os.environ["REDSHIFT_USER"]
REDSHIFT_PASSWORD = os.environ["REDSHIFT_PASSWORD"]
REDSHIFT_HOST = os.environ["REDSHIFT_HOST"]
REDSHIFT_PORT = os.environ["REDSHIFT_PORT"]
REDSHIFT_DBNAME = os.environ["REDSHIFT_DBNAME"]

DB = "postgresql://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}".format(**CONFIG)
DB = "postgresql://{REDSHIFT_USER}:{REDSHIFT_PASSWORD}@{REDSHIFT_HOST}:{REDSHIFT_PORT}/{REDSHIFT_DBNAME}".format(
REDSHIFT_USER=REDSHIFT_USER, REDSHIFT_PASSWORD=REDSHIFT_PASSWORD, REDSHIFT_HOST=REDSHIFT_HOST,
REDSHIFT_PORT=REDSHIFT_PORT, REDSHIFT_DBNAME=REDSHIFT_DBNAME
)

TABLE_SUFFIXES = (
"_sampled_10",
@@ -17,15 +17,26 @@
# Load config from disk,
# and pull in credentials from the environment.

with open("config.json") as f:
CONFIG = json.loads(f.read())
REDSHIFT_USER = os.environ["REDSHIFT_USER"]
REDSHIFT_PASSWORD = os.environ["REDSHIFT_PASSWORD"]
REDSHIFT_HOST = os.environ["REDSHIFT_HOST"]
REDSHIFT_PORT = os.environ["REDSHIFT_PORT"]
REDSHIFT_DBNAME = os.environ["REDSHIFT_DBNAME"]

DB = "postgresql://{REDSHIFT_USER}:{REDSHIFT_PASSWORD}@{REDSHIFT_HOST}:{REDSHIFT_PORT}/{REDSHIFT_DBNAME}".format(
REDSHIFT_USER=REDSHIFT_USER, REDSHIFT_PASSWORD=REDSHIFT_PASSWORD, REDSHIFT_HOST=REDSHIFT_HOST,
REDSHIFT_PORT=REDSHIFT_PORT, REDSHIFT_DBNAME=REDSHIFT_DBNAME
)

def env_or_default(variable_name, default_value):
if variable_name in os.environ:
return os.environ[variable_name]

if "aws_access_key_id" not in CONFIG:
p = boto.provider.Provider("aws")
CONFIG["aws_access_key_id"] = p.get_access_key()
CONFIG["aws_secret_access_key"] = p.get_secret_key()
return default_value

DB = "postgresql://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}".format(**CONFIG)
p = boto.provider.Provider("aws")
AWS_ACCESS_KEY = env_or_default("AWS_ACCESS_KEY", p.get_access_key())
AWS_SECRET_KEY = env_or_default("AWS_SECRET_KEY", p.get_secret_key())

# Event data files are named like "events-2016-02-15.csv"
# and contain events for the specified date.
@@ -92,7 +103,7 @@
device_id
)
FROM '{s3path}'
CREDENTIALS 'aws_access_key_id={aws_access_key_id};aws_secret_access_key={aws_secret_access_key}'
CREDENTIALS 'aws_access_key_id={AWS_ACCESS_KEY};aws_secret_access_key={AWS_SECRET_KEY}'
FORMAT AS CSV
TRUNCATECOLUMNS;
"""
@@ -174,7 +185,7 @@ def import_events(force_reload=False):
db.run(Q_CLEAR_DAY.format(suffix=rate["suffix"], day=day))
s3path = EVENTS_FILE_URL.format(day=day)
# Copy data from s3 into redshift
db.run(Q_COPY_CSV.format(s3path=s3path, **CONFIG))
db.run(Q_COPY_CSV.format(s3path=s3path, AWS_ACCESS_KEY=AWS_ACCESS_KEY, AWS_SECRET_KEY=AWS_SECRET_KEY))
# Populate the activity_events table
for rate in SAMPLE_RATES:
    db.run(Q_INSERT_EVENTS.format(suffix=rate["suffix"], percent=rate["percent"], last_day=last_day, months=rate["months"]))
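
The S3 import scripts follow the same pattern for the AWS side: AWS_ACCESS_KEY and AWS_SECRET_KEY are read from the environment when present, and otherwise env_or_default falls back to whatever boto's credential provider resolves (for example boto config files or instance metadata). Exporting them is therefore optional; if set explicitly, it would look something like this (placeholder values):

    export AWS_ACCESS_KEY=EXAMPLEKEYID
    export AWS_SECRET_KEY=examplesecretkey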
@@ -4,20 +4,34 @@
import boto.s3
import boto.provider
import postgres
import os

with open("config.json") as f:
CONFIG = json.loads(f.read())
REDSHIFT_USER = os.environ["REDSHIFT_USER"]
REDSHIFT_PASSWORD = os.environ["REDSHIFT_PASSWORD"]
REDSHIFT_HOST = os.environ["REDSHIFT_HOST"]
REDSHIFT_PORT = os.environ["REDSHIFT_PORT"]
REDSHIFT_DBNAME = os.environ["REDSHIFT_DBNAME"]

if "aws_access_key_id" not in CONFIG:
aws = boto.provider.Provider("aws")
CONFIG["aws_access_key_id"] = aws.get_access_key()
CONFIG["aws_secret_access_key"] = aws.get_secret_key()
DB_URI = "postgresql://{REDSHIFT_USER}:{REDSHIFT_PASSWORD}@{REDSHIFT_HOST}:{REDSHIFT_PORT}/{REDSHIFT_DBNAME}".format(
REDSHIFT_USER=REDSHIFT_USER, REDSHIFT_PASSWORD=REDSHIFT_PASSWORD, REDSHIFT_HOST=REDSHIFT_HOST,
REDSHIFT_PORT=REDSHIFT_PORT, REDSHIFT_DBNAME=REDSHIFT_DBNAME
)

def env_or_default(variable_name, default_value):
if variable_name in os.environ:
return os.environ[variable_name]

return default_value

aws = boto.provider.Provider("aws")
AWS_ACCESS_KEY = env_or_default("AWS_ACCESS_KEY", aws.get_access_key())
AWS_SECRET_KEY = env_or_default("AWS_SECRET_KEY", aws.get_secret_key())

S3_REGION = "us-east-1"
S3_BUCKET = "net-mozaws-prod-us-west-2-pipeline-analysis"
S3_PREFIX = "fxa-basic-metrics/"
S3_URI = "s3://" + S3_BUCKET + "/" + S3_PREFIX + "fxa-basic-metrics-{day}.txt"
DB_URI = "postgresql://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}".format(**CONFIG)

COUNTS_BEGIN = datetime.strptime("2017-05-30", "%Y-%m-%d")

Q_DROP_CSV_TABLE = "DROP TABLE IF EXISTS temporary_raw_counts;"
@@ -51,10 +65,10 @@
Q_COPY_CSV = """
COPY temporary_raw_counts (day, accounts, verified_accounts)
FROM '{s3_uri}'
CREDENTIALS 'aws_access_key_id={aws_access_key_id};aws_secret_access_key={aws_secret_access_key}'
CREDENTIALS 'aws_access_key_id={AWS_ACCESS_KEY};aws_secret_access_key={AWS_SECRET_KEY}'
FORMAT AS CSV
TRUNCATECOLUMNS;
""".format(s3_uri=S3_URI, **CONFIG)
""".format(s3_uri=S3_URI, AWS_ACCESS_KEY=AWS_ACCESS_KEY, AWS_SECRET_KEY=AWS_SECRET_KEY)

Q_INSERT_COUNTS = """
INSERT INTO counts (day, accounts, verified_accounts)
@@ -4,17 +4,30 @@
import boto.s3
import boto.provider
import postgres
import os

with open("config.json") as f:
CONFIG = json.loads(f.read())
REDSHIFT_USER = os.environ["REDSHIFT_USER"]
REDSHIFT_PASSWORD = os.environ["REDSHIFT_PASSWORD"]
REDSHIFT_HOST = os.environ["REDSHIFT_HOST"]
REDSHIFT_PORT = os.environ["REDSHIFT_PORT"]
REDSHIFT_DBNAME = os.environ["REDSHIFT_DBNAME"]

if "aws_access_key_id" not in CONFIG:
aws = boto.provider.Provider("aws")
CONFIG["aws_access_key_id"] = aws.get_access_key()
CONFIG["aws_secret_access_key"] = aws.get_secret_key()
DB_URI = "postgresql://{REDSHIFT_USER}:{REDSHIFT_PASSWORD}@{REDSHIFT_HOST}:{REDSHIFT_PORT}/{REDSHIFT_DBNAME}".format(
REDSHIFT_USER=REDSHIFT_USER, REDSHIFT_PASSWORD=REDSHIFT_PASSWORD, REDSHIFT_HOST=REDSHIFT_HOST,
REDSHIFT_PORT=REDSHIFT_PORT, REDSHIFT_DBNAME=REDSHIFT_DBNAME
)

def env_or_default(variable_name, default_value):
if variable_name in os.environ:
return os.environ[variable_name]

return default_value

aws = boto.provider.Provider("aws")
AWS_ACCESS_KEY = env_or_default("AWS_ACCESS_KEY", aws.get_access_key())
AWS_SECRET_KEY = env_or_default("AWS_SECRET_KEY", aws.get_secret_key())

S3_BUCKET = "net-mozaws-prod-us-west-2-pipeline-analysis"
DB_URI = "postgresql://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}".format(**CONFIG)

# The default data set automatically expires data at
# three months. We also have sampled data sets that
@@ -69,14 +82,14 @@
{columns}
)
FROM '{s3_path}'
CREDENTIALS 'aws_access_key_id={aws_access_key_id};aws_secret_access_key={aws_secret_access_key}'
CREDENTIALS 'aws_access_key_id={AWS_ACCESS_KEY};aws_secret_access_key={AWS_SECRET_KEY}'
FORMAT AS CSV
TRUNCATECOLUMNS;
""".format(table=TABLE_NAMES["temp"],
columns="{columns}",
s3_path="{s3_path}",
aws_access_key_id="{aws_access_key_id}",
aws_secret_access_key="{aws_secret_access_key}")
AWS_ACCESS_KEY="{AWS_ACCESS_KEY}",
AWS_SECRET_KEY="{AWS_SECRET_KEY}")

Q_CLEAR_DAY = """
DELETE FROM {table}
@@ -178,7 +191,8 @@ def import_day(day):
db.run(Q_COPY_CSV.format(event_type=event_type,
                         columns=temp_columns,
                         s3_path=s3_path,
                         **CONFIG))
                         AWS_ACCESS_KEY=AWS_ACCESS_KEY,
                         AWS_SECRET_KEY=AWS_SECRET_KEY))
print_timestamp("MIN")
print_timestamp("MAX")
for rate in SAMPLE_RATES:
