requirements*.txt support + docs
Signed-off-by: Gaetan Semet <gaetan@xeberon.net>
gsemet committed Oct 28, 2017
1 parent f84d458 commit 0c38fd8
Showing 7 changed files with 171 additions and 30 deletions.
43 changes: 23 additions & 20 deletions Dockerfile.template
@@ -14,6 +14,8 @@ ENV TERM linux
# Airflow
ARG AIRFLOW_VERSION=%%AIRFLOW_VERSION%%
ENV AIRFLOW_HOME /usr/local/airflow
ENV EMBEDDED_DAGS_LOCATION=%%EMBEDDED_DAGS_LOCATION%%
ENV REQUIREMENTS_TXT_LOCATION=%%REQUIREMENTS_TXT_LOCATION%%

# Define en_US.
ENV LANGUAGE en_US.UTF-8
@@ -23,44 +25,43 @@ ENV LC_CTYPE en_US.UTF-8
ENV LC_MESSAGES en_US.UTF-8
ENV LC_ALL en_US.UTF-8

WORKDIR /requirements
# Only copy needed files
COPY requirements/airflow.txt /requirements/airflow.txt
COPY ${REQUIREMENTS_TXT_LOCATION} /requirements/dags.txt


RUN set -ex \
&& buildDeps=' \
python3-pip \
python3-dev \
libkrb5-dev \
libsasl2-dev \
libxml2-dev \
libssl-dev \
libffi-dev \
build-essential \
libblas-dev \
libffi-dev \
libkrb5-dev \
liblapack-dev \
libpq-dev \
libsasl2-dev \
libssl-dev \
libxml2-dev \
libxslt1-dev \
python3-dev \
python3-pip \
zlib1g-dev \
' \
&& echo "deb http://http.debian.net/debian jessie-backports main" >/etc/apt/sources.list.d/backports.list \
&& apt-get update -yqq \
&& apt-get install -yqq --no-install-recommends \
$buildDeps \
apt-utils \
curl \
netcat \
git \
locales \
&& apt-get install -yqq -t jessie-backports libpq-dev git \
netcat \
&& sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \
&& locale-gen \
&& update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \
&& useradd -ms /bin/bash -d ${AIRFLOW_HOME} airflow \
&& pip3 install --upgrade pip enum34 'setuptools!=36.0.0' \
&& pip3 install pytz==2015.7 \
&& pip3 install cryptography \
&& pip3 install requests \
&& pip3 install pyOpenSSL \
&& pip3 install ndg-httpsclient \
&& pip3 install pyasn1 \
&& pip3 install psycopg2 \
&& pip3 install airflow[celery,postgresql,hive] \
&& pip3 install click \
&& pip3 install --upgrade pip 'setuptools!=36.0.0' \
&& pip3 install -r /requirements/airflow.txt \
&& pip3 install -r /requirements/dags.txt \
&& apt-get remove --purge -yqq $buildDeps libpq-dev \
&& apt-get clean \
&& rm -rf \
@@ -78,6 +79,8 @@ RUN curl -L -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-
COPY script/entrypoint.sh ${AIRFLOW_HOME}/entrypoint.sh
COPY config/airflow.cfg.in ${AIRFLOW_HOME}/airflow.cfg.in
COPY script/git-sync ${AIRFLOW_HOME}/git-sync
COPY ${EMBEDDED_DAGS_LOCATION} ${AIRFLOW_HOME}/dags

RUN chown -R airflow: ${AIRFLOW_HOME} \
&& chmod +x ${AIRFLOW_HOME}/entrypoint.sh \
30 changes: 28 additions & 2 deletions Makefile
@@ -12,13 +12,19 @@ DOCKERFILE ?= $(BUILD_ROOT)/Dockerfile
ROOTFS ?= $(BUILD_ROOT)/rootfs
AIRFLOW_CONF ?= $(BUILD_ROOT)/config/airflow.cfg.in
ENTRYPOINT_SH ?= $(BUILD_ROOT)/script/entrypoint.sh
GIT_SYNC ?= $(BUILD_ROOT)/script/git-sync
DAGS ?= $(BUILD_ROOT)/dags
AIRFLOW_REQUIREMENTS ?= $(BUILD_ROOT)/requirements/airflow.txt
DAGS_REQUIREMENTS ?= $(BUILD_ROOT)/requirements/dags.txt
DOCKER_CACHE ?= docker-cache
SAVED_IMAGE ?= $(DOCKER_CACHE)/image-$(AIRFLOW_VERSION)-$(KUBECTL_VERSION).tar

NAMESPACE ?= airflow-dev
HELM_APPLICATION_NAME ?= airflow
HELM_CONFIG ?= config.yaml
CHART_LOCATION ?= ./airflow
EMBEDDED_DAGS_LOCATION ?= "./dags"
REQUIREMENTS_TXT_LOCATION ?= "requirements/dags.txt"

.PHONY: build clean

@@ -46,14 +52,18 @@ helm-ls:
helm-uninstall:
helm del --purge $(HELM_APPLICATION_NAME)

build: $(DOCKERFILE) $(ROOTFS) $(AIRFLOW_CONF) $(ENTRYPOINT_SH)
build: clean $(DOCKERFILE) $(ROOTFS) $(DAGS) $(AIRFLOW_CONF) $(ENTRYPOINT_SH) $(GIT_SYNC) $(AIRFLOW_REQUIREMENTS) $(DAGS_REQUIREMENTS)
cd $(BUILD_ROOT) && docker build -t $(IMAGE) . && docker tag $(IMAGE) $(ALIAS)

publish:
docker push $(IMAGE) && docker push $(ALIAS)

$(DOCKERFILE): $(BUILD_ROOT)
sed -e 's/%%KUBECTL_VERSION%%/'"$(KUBECTL_VERSION)"'/g;' -e 's/%%AIRFLOW_VERSION%%/'"$(AIRFLOW_VERSION)"'/g;' Dockerfile.template > $(DOCKERFILE)
sed -e 's/%%KUBECTL_VERSION%%/'"$(KUBECTL_VERSION)"'/g;' \
-e 's/%%AIRFLOW_VERSION%%/'"$(AIRFLOW_VERSION)"'/g;' \
-e 's#%%EMBEDDED_DAGS_LOCATION%%#'"$(EMBEDDED_DAGS_LOCATION)"'#g;' \
-e 's#%%REQUIREMENTS_TXT_LOCATION%%#'"$(REQUIREMENTS_TXT_LOCATION)"'#g;' \
Dockerfile.template > $(DOCKERFILE)

$(ROOTFS): $(BUILD_ROOT)
mkdir -p rootfs
@@ -67,6 +77,22 @@ $(ENTRYPOINT_SH): $(BUILD_ROOT)
mkdir -p $(shell dirname $(ENTRYPOINT_SH))
cp script/entrypoint.sh $(ENTRYPOINT_SH)

$(GIT_SYNC): $(BUILD_ROOT)
mkdir -p $(shell dirname $(GIT_SYNC))
cp script/git-sync $(GIT_SYNC)

$(AIRFLOW_REQUIREMENTS): $(BUILD_ROOT)
mkdir -p $(shell dirname $(AIRFLOW_REQUIREMENTS))
cp requirements/airflow.txt $(AIRFLOW_REQUIREMENTS)

$(DAGS_REQUIREMENTS): $(BUILD_ROOT)
mkdir -p $(shell dirname $(DAGS_REQUIREMENTS))
cp $(REQUIREMENTS_TXT_LOCATION) $(DAGS_REQUIREMENTS)

$(DAGS): $(BUILD_ROOT)
mkdir -p $(shell dirname $(DAGS))
cp -R $(EMBEDDED_DAGS_LOCATION) $(DAGS)

$(BUILD_ROOT):
mkdir -p $(BUILD_ROOT)

69 changes: 64 additions & 5 deletions README.md
@@ -79,11 +79,24 @@ Note:
Do NOT use characters such as " (double quote), ' (simple quote), / (slash) or \ (backslash)
in your passwords and prefix

### git-sync
### DAGs deployment: embedded DAGs or git-sync

This chart allows using git-sync to synchronize DAGs with a git project. While it is extremely cool
to see its DAG appears on Airflow 60s after merge on this project, you should be aware of some
limitation Airflow has with dynamic DAG updates
This chart provides two ways of deploying DAGs in your Airflow installation:

- embedded DAGs
- Git-Sync

A possible enhancement would be to support Persistent Storage. If you are willing to contribute,
do not hesitate to open a Pull Request!

#### Using Git-Sync

Git-sync is the easiest way to automatically update your DAGs. It periodically checks (by default
every minute) a Git project on a given branch and checks out the new version when available. The
scheduler and workers see changes almost in real time. There is no need for any other tool or a
complex rolling-update procedure.
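
For a quick test outside the chart (a sketch — the image name and repository URL below are
placeholders), the entrypoint starts the background sync whenever the `GIT_SYNC_REPO` environment
variable is set:

    docker run -e GIT_SYNC_REPO=https://github.com/your-org/your-dags.git \
        your-registry/kube-airflow:latest scheduler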

While it is extremely cool to see your DAG appear in Airflow 60 seconds after it is merged, you should be aware of some limitations Airflow has with dynamic DAG updates:

If the scheduler reloads a dag in the middle of a dagrun then the dagrun will actually start
using the new version of the dag in the middle of execution.
@@ -94,13 +107,58 @@ like solution with airflow without:
- using explicit locking, i.e. never pull down a new DAG if a dagrun is in progress
- making DAGs immutable: never modify your DAG, always make a new one

Also keep in mind that git-sync may not scale at all in production if you have a lot of DAGs. The
best way to deploy your DAGs is to build a new Docker image containing all of them and their
dependencies. To do so, fork this project.

#### Embedded DAGs

If you want more control over the way you deploy your DAGs, you can use embedded DAGs, where DAGs
are baked into the Docker image deployed as the Scheduler and Workers.

Be aware this requires heavier tooling than git-sync, especially if you use CI/CD:

- your CI/CD should be able to build a new Docker image each time your DAGs are updated;
- your CI/CD should be able to control the deployment of this new image to your Kubernetes cluster.

Example procedure:

- Fork this project
- Place your DAGs inside the `dags` folder of this project, and update `requirements/dags.txt` to
  install new dependencies if needed (see below)
- Add a build script connected to your CI that builds the new Docker image
- Deploy on your Kubernetes cluster

You can avoid forking this project by:

- keeping a Git project dedicated to storing only your DAGs, plus a dedicated `requirements.txt`
- gating any change to your DAGs in your CI (unit tests, `pip install -r requirements.txt`, ...)
- having your CI/CD build a new Docker image after each successful merge using

    DAG_PATH=$PWD
    cd /path/to/kube-airflow
    make EMBEDDED_DAGS_LOCATION=$DAG_PATH

- triggering the deployment of this new image on your Kubernetes infrastructure
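
A hypothetical CI job tying these steps together might look like this (the test command and paths
are placeholders for your own setup):

    # validate the DAGs before building anything
    pip install -r requirements.txt
    python -m pytest tests/
    # build the image with the DAGs and their requirements embedded
    DAG_PATH=$PWD
    cd /path/to/kube-airflow
    make EMBEDDED_DAGS_LOCATION=$DAG_PATH REQUIREMENTS_TXT_LOCATION=$DAG_PATH/requirements.txt
    # push the image so the cluster can pull it (IMAGE is defined in the Makefile)
    make publish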

### Python dependencies

If you want to add specific Python dependencies for use in your DAGs, simply declare them inside
the `requirements/dags.txt` file. They will be automatically installed inside the container during
the build, so you can use these libraries directly in your DAGs.
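
For instance (the packages below are purely illustrative), one line per dependency is enough:

    # requirements/dags.txt
    pandas==0.20.3
    boto3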

To use another file, call:

    make REQUIREMENTS_TXT_LOCATION=/path/to/your/dags/requirements.txt

Please note this requires the same tooling environment in your CI/CD as when using embedded DAGs.

### Helm configuration customization

Helm allows overriding the configuration to adapt it to your environment; for instance, you
probably want to specify your own ingress configuration.
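
For example (a sketch — the keys inside `config.yaml` depend on the chart's values), you can pass
your overrides at deployment time using the defaults from the Makefile:

    helm upgrade --install airflow ./airflow \
        --namespace airflow-dev \
        -f config.yaml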



## Build Docker image

`git clone` this repository and then just run:
@@ -111,6 +169,7 @@

You can browse the Airflow dashboard via running:

minikube start
make browse-web

the Flower dashboard via running:
20 changes: 20 additions & 0 deletions dags/README.md
@@ -0,0 +1,20 @@
# Airflow DAGs

Place your DAGs in this folder if you choose to deploy your DAGs inside the Docker image.

This enforces consistency between all the containers, and has the following advantages:

- forces a restart of the scheduler on DAG update, avoiding inconsistency issues when a DAG is
  updated while it is running
- allows installation of dependencies

It is also possible to keep your DAGs in an external Git project, combine this with a DAG
requirements declaration, and call kube-airflow's Makefile accordingly. For example, you can:

- gate every DAG change behind a merge-request mechanism: unit tests, pip install, ...
- make a new Docker image using

    DAG_PATH=$PWD
    cd /path/to/kube-airflow
    make EMBEDDED_DAGS_LOCATION=$DAG_PATH REQUIREMENTS_TXT_LOCATION=$DAG_PATH/requirements.txt

- trigger the deployment of this new image on your Kubernetes infrastructure
12 changes: 12 additions & 0 deletions requirements/airflow.txt
@@ -0,0 +1,12 @@
# This file contains the dependencies needed to install Airflow.
# To declare new dependencies, use `requirements/dags.txt`.

pytz==2015.7
cryptography
requests
pyOpenSSL
ndg-httpsclient
pyasn1
psycopg2
airflow[celery,postgresql,hive]
click
15 changes: 15 additions & 0 deletions requirements/dags.txt
@@ -0,0 +1,15 @@
# Place in this file the Python requirements your DAGs may need. They will be automatically
# installed inside the container during the build, regardless of whether you use embedded DAGs
# or Git-sync.

pytz==2015.7
cryptography
requests
pyOpenSSL
ndg-httpsclient
pyasn1
psycopg2
airflow[celery,postgresql,hive]
click

# add new dependencies below:
12 changes: 9 additions & 3 deletions script/entrypoint.sh
@@ -9,14 +9,17 @@ RABBITMQ_HOST="${RABBITMQ_HOST:-rabbitmq}"
RABBITMQ_CREDS="${RABBITMQ_CREDS:-airflow:airflow}"
RABBITMQ_MANAGEMENT_PORT=15672
FLOWER_URL_PREFIX="${FLOWER_URL_PREFIX:-/}"
LOAD_DAGS_EXAMPLES="${LOAD_DAGS_EXAMPLES:false}"
LOAD_DAGS_EXAMPLES="${LOAD_DAGS_EXAMPLES:-true}"
GIT_SYNC_REPO="${GIT_SYNC_REPO:-}"

if [ -z $FERNET_KEY ]; then
FERNET_KEY=$(python3 -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY))"
FERNET_KEY=$(python3 -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)")
fi

echo "Postgres host: $POSTGRES_HOST"
echo "RabbitMQ host: $RABBITMQ_HOST"
echo "Load DAG examples: $LOAD_DAGS_EXAMPLES"
echo "Git sync repository: $GIT_SYNC_REPO"
echo

# Generate Fernet key
@@ -42,7 +45,7 @@ if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] || [
done
fi

# wait for DB
# wait for postgres
if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] ; then
i=0
while ! nc $POSTGRES_HOST $POSTGRES_PORT >/dev/null 2>&1 < /dev/null; do
@@ -62,6 +65,9 @@ if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] ; the
fi

if [ ! -z $GIT_SYNC_REPO ]; then
mkdir -p $AIRFLOW_HOME/dags
# remove possible embedded dags to avoid conflicts
rm -rf $AIRFLOW_HOME/dags/*
echo "Executing background task git-sync on repo $GIT_SYNC_REPO"
$AIRFLOW_HOME/git-sync --dest $AIRFLOW_HOME/dags &
fi
