From 0c38fd8f21635b08bb5b4e09c0b181b321555d56 Mon Sep 17 00:00:00 2001
From: Gaetan Semet
Date: Sat, 28 Oct 2017 19:24:26 +0200
Subject: [PATCH] requirements*.txt support + docs

Signed-off-by: Gaetan Semet
---
 Dockerfile.template      | 42 +++++++++++++------------
 Makefile                 | 31 +++++++++++++++---
 README.md                | 76 ++++++++++++++++++++++++++++++++++++++----
 dags/README.md           | 26 ++++++++++++++
 requirements/airflow.txt | 12 +++++++
 requirements/dags.txt    | 15 +++++++++
 script/entrypoint.sh     | 12 +++++--
 7 files changed, 184 insertions(+), 30 deletions(-)
 create mode 100644 dags/README.md
 create mode 100644 requirements/airflow.txt
 create mode 100644 requirements/dags.txt

diff --git a/Dockerfile.template b/Dockerfile.template
index a6dd348..8d96bf9 100644
--- a/Dockerfile.template
+++ b/Dockerfile.template
@@ -14,6 +14,8 @@ ENV TERM linux
 # Airflow
 ARG AIRFLOW_VERSION=%%AIRFLOW_VERSION%%
 ENV AIRFLOW_HOME /usr/local/airflow
+ENV EMBEDDED_DAGS_LOCATION=%%EMBEDDED_DAGS_LOCATION%%
+ENV REQUIREMENTS_TXT_LOCATION=%%REQUIREMENTS_TXT_LOCATION%%
 
 # Define en_US.
 ENV LANGUAGE en_US.UTF-8
@@ -23,44 +25,43 @@ ENV LC_CTYPE en_US.UTF-8
 ENV LC_MESSAGES en_US.UTF-8
 ENV LC_ALL en_US.UTF-8
 
+WORKDIR /requirements
+# Only copy needed files
+COPY requirements/airflow.txt /requirements/airflow.txt
+COPY ${REQUIREMENTS_TXT_LOCATION} /requirements/dags.txt
+
+
 RUN set -ex \
     && buildDeps=' \
-        python3-pip \
-        python3-dev \
-        libkrb5-dev \
-        libsasl2-dev \
-        libxml2-dev \
-        libssl-dev \
-        libffi-dev \
         build-essential \
         libblas-dev \
+        libffi-dev \
+        libkrb5-dev \
         liblapack-dev \
+        libpq-dev \
+        libsasl2-dev \
+        libssl-dev \
+        libxml2-dev \
         libxslt1-dev \
+        python3-dev \
+        python3-pip \
         zlib1g-dev \
     ' \
-    && echo "deb http://http.debian.net/debian jessie-backports main" >/etc/apt/sources.list.d/backports.list \
     && apt-get update -yqq \
    && apt-get install -yqq --no-install-recommends \
         $buildDeps \
         apt-utils \
         curl \
-        netcat \
+        git \
         locales \
-    && apt-get install -yqq -t jessie-backports libpq-dev git \
+        netcat \
     && sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \
     && locale-gen \
     && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \
     && useradd -ms /bin/bash -d ${AIRFLOW_HOME} airflow \
-    && pip3 install --upgrade pip enum34 'setuptools!=36.0.0' \
-    && pip3 install pytz==2015.7 \
-    && pip3 install cryptography \
-    && pip3 install requests \
-    && pip3 install pyOpenSSL \
-    && pip3 install ndg-httpsclient \
-    && pip3 install pyasn1 \
-    && pip3 install psycopg2 \
-    && pip3 install airflow[celery,postgresql,hive] \
-    && pip3 install click \
+    && pip3 install --upgrade pip 'setuptools!=36.0.0' \
+    && pip3 install -r /requirements/airflow.txt \
+    && pip3 install -r /requirements/dags.txt \
     && apt-get remove --purge -yqq $buildDeps libpq-dev \
     && apt-get clean \
     && rm -rf \
@@ -78,6 +79,7 @@ RUN curl -L -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-
 COPY script/entrypoint.sh ${AIRFLOW_HOME}/entrypoint.sh
 COPY config/airflow.cfg.in ${AIRFLOW_HOME}/airflow.cfg.in
 COPY script/git-sync ${AIRFLOW_HOME}/git-sync
+COPY ${EMBEDDED_DAGS_LOCATION} ${AIRFLOW_HOME}/dags
 
 RUN chown -R airflow: ${AIRFLOW_HOME} \
     && chmod +x ${AIRFLOW_HOME}/entrypoint.sh \

diff --git a/Makefile b/Makefile
index 5f19f2b..f9a03b2 100644
--- a/Makefile
+++ b/Makefile
@@ -12,6 +12,11 @@ DOCKERFILE ?= $(BUILD_ROOT)/Dockerfile
 ROOTFS ?= $(BUILD_ROOT)/rootfs
 AIRFLOW_CONF ?= $(BUILD_ROOT)/config/airflow.cfg.in
 ENTRYPOINT_SH ?= $(BUILD_ROOT)/script/entrypoint.sh
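+# Build-root copies of the git-sync script, the embedded DAGs folder and the requirements files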
+GIT_SYNC ?= $(BUILD_ROOT)/script/git-sync
+DAGS ?= $(BUILD_ROOT)/dags
+AIRFLOW_REQUIREMENTS ?= $(BUILD_ROOT)/requirements/airflow.txt
+DAGS_REQUIREMENTS ?= $(BUILD_ROOT)/requirements/dags.txt
 DOCKER_CACHE ?= docker-cache
 SAVED_IMAGE ?= $(DOCKER_CACHE)/image-$(AIRFLOW_VERSION)-$(KUBECTL_VERSION).tar
 
@@ -19,6 +24,8 @@ NAMESPACE ?= airflow-dev
 HELM_APPLICATION_NAME ?= airflow
 HELM_CONFIG ?= config.yaml
 CHART_LOCATION ?= ./airflow
+EMBEDDED_DAGS_LOCATION ?= "./dags"
+REQUIREMENTS_TXT_LOCATION ?= "requirements/dags.txt"
 
 .PHONY: build clean
 
@@ -46,14 +53,18 @@ helm-ls:
 helm-uninstall:
 	helm del --purge $(HELM_APPLICATION_NAME)
 
-build: $(DOCKERFILE) $(ROOTFS) $(AIRFLOW_CONF) $(ENTRYPOINT_SH)
+build: clean $(DOCKERFILE) $(ROOTFS) $(DAGS) $(AIRFLOW_CONF) $(ENTRYPOINT_SH) $(GIT_SYNC) $(AIRFLOW_REQUIREMENTS) $(DAGS_REQUIREMENTS)
 	cd $(BUILD_ROOT) && docker build -t $(IMAGE) . && docker tag $(IMAGE) $(ALIAS)
 
 publish:
 	docker push $(IMAGE) && docker push $(ALIAS)
 
 $(DOCKERFILE): $(BUILD_ROOT)
-	sed -e 's/%%KUBECTL_VERSION%%/'"$(KUBECTL_VERSION)"'/g;' -e 's/%%AIRFLOW_VERSION%%/'"$(AIRFLOW_VERSION)"'/g;' Dockerfile.template > $(DOCKERFILE)
+	sed -e 's/%%KUBECTL_VERSION%%/'"$(KUBECTL_VERSION)"'/g;' \
+	    -e 's/%%AIRFLOW_VERSION%%/'"$(AIRFLOW_VERSION)"'/g;' \
+	    -e 's#%%EMBEDDED_DAGS_LOCATION%%#'"$(EMBEDDED_DAGS_LOCATION)"'#g;' \
+	    -e 's#%%REQUIREMENTS_TXT_LOCATION%%#'"$(REQUIREMENTS_TXT_LOCATION)"'#g;' \
+	    Dockerfile.template > $(DOCKERFILE)
 
 $(ROOTFS): $(BUILD_ROOT)
 	mkdir -p rootfs
@@ -67,6 +78,22 @@ $(ENTRYPOINT_SH): $(BUILD_ROOT)
 	mkdir -p $(shell dirname $(ENTRYPOINT_SH))
 	cp script/entrypoint.sh $(ENTRYPOINT_SH)
 
+$(GIT_SYNC): $(BUILD_ROOT)
+	mkdir -p $(shell dirname $(GIT_SYNC))
+	cp script/git-sync $(GIT_SYNC)
+
+$(AIRFLOW_REQUIREMENTS): $(BUILD_ROOT)
+	mkdir -p $(shell dirname $(AIRFLOW_REQUIREMENTS))
+	cp requirements/airflow.txt $(AIRFLOW_REQUIREMENTS)
+
+$(DAGS_REQUIREMENTS): $(BUILD_ROOT)
+	mkdir -p $(shell dirname $(DAGS_REQUIREMENTS))
+	cp $(REQUIREMENTS_TXT_LOCATION) $(DAGS_REQUIREMENTS)
+
+$(DAGS): $(BUILD_ROOT)
+	mkdir -p $(shell dirname $(DAGS))
+	cp -R $(EMBEDDED_DAGS_LOCATION) $(DAGS)
+
 $(BUILD_ROOT):
 	mkdir -p $(BUILD_ROOT)
 
diff --git a/README.md b/README.md
index 6e35262..dda0842 100644
--- a/README.md
+++ b/README.md
@@ -79,11 +79,31 @@
 Note: Do NOT use characters such as " (double quote), ' (simple quote), / (slash) or \
 (backslash) in your passwords and prefix
 
-### git-sync
+### DAGs deployment: embedded DAGs or git-sync
 
-This chart allows using git-sync to synchronize DAGs with a git project. While it is extremely cool
-to see its DAG appears on Airflow 60s after merge on this project, you should be aware of some
-limitation Airflow has with dynamic DAG updates
+This chart provides two ways of deploying DAGs in your Airflow installation:
+
+- embedded DAGs
+- Git-Sync
+
+A possible enhancement would be to support Persistent Storage. If you are willing to contribute,
+do not hesitate to open a Pull Request!
+
+#### Using Git-Sync
+
+Git-sync is the easiest way to automatically update your DAGs. It periodically checks (by default
+every minute) a Git project on a given branch, and checks out the new revision when it is
+available. Scheduler and workers see the changes in near real time, with no need for any other
+tool or a complex rolling-update procedure.
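+
+For instance, assuming the container runs the provided `entrypoint.sh`, pointing it at a
+repository is a matter of setting the `GIT_SYNC_REPO` environment variable (the image name and
+repository URL below are placeholders):
+
+    docker run -e GIT_SYNC_REPO=https://github.com/your-org/your-dags.git \
+        your-registry/kube-airflow scheduler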
+
+While it is extremely cool to see your DAG appear in Airflow about 60 seconds after it is merged into your project, you should be aware of some limitations Airflow has with dynamic DAG updates:
 
 If the scheduler reloads a dag in the middle of a dagrun then the dagrun will actually start
 using the new version of the dag in the middle of execution.
 
@@ -94,13 +114,58 @@ like solution with airflow without:
 
 - using explicit locking, ie never pull down a new dag if a dagrun is in progress
 - make dags immutable, never modify your dag always make a new one
 
+Also keep in mind that git-sync may not scale well in production if you have a lot of DAGs. The
+best way to deploy your DAGs is then to build a new docker image containing all the DAGs and their
+dependencies. To do so, fork this project.
+
+#### Embedded DAGs
+
+If you want more control over the way you deploy your DAGs, you can use embedded DAGs, where DAGs
+are burned inside the Docker container deployed as Scheduler and Workers.
+
+Be aware this requires heavier tooling than using git-sync, especially if you use CI/CD:
+
+- your CI/CD should be able to build a new docker image each time your DAGs are updated
+- your CI/CD should be able to control the deployment of this new image on your kubernetes cluster
+
+Example of procedure:
+
+- Fork this project
+- Place your DAGs inside the `dags` folder of this project, and update `requirements/dags.txt` to
+  install new dependencies if needed (see below)
+- Add a build script connected to your CI that will build the new docker image
+- Deploy on your Kubernetes cluster
+
+You can avoid forking this project by:
+
+- keeping a git project dedicated to storing only your DAGs and a dedicated `requirements.txt`
+- gating any change to your DAGs in your CI (unit tests, `pip install -r requirements.txt`, ...)
+- having your CI/CD build a new docker image after each successful merge, using
+
+      DAG_PATH=$PWD
+      cd /path/to/kube-airflow
+      make EMBEDDED_DAGS_LOCATION=$DAG_PATH REQUIREMENTS_TXT_LOCATION=$DAG_PATH/requirements.txt
+
+- triggering the deployment of this new image on your Kubernetes infrastructure
+
+### Python dependencies
+
+If you want to use specific python dependencies in your DAGs, simply declare them inside the
+`requirements/dags.txt` file. They will be automatically installed inside the container during the
+build, so you can directly use these libraries in your DAGs.
+
+To use another file, call:
+
+    make REQUIREMENTS_TXT_LOCATION=/path/to/your/dags/requirements.txt
+
+Please note this requires the same tooling environment in your CI/CD as when using embedded DAGs.
+
 ### Helm configuration customization
 
 Helm allow to overload the configuration to adapt to your environment. You probably want to
 specify your own ingress configuration for instance.
 
-
 ## Build Docker image
 
 `git clone` this repository and then just run:
@@ -111,6 +176,7 @@
 
 You can browse the Airflow dashboard via running:
 
+    minikube start
     make browse-web
 
 the Flower dashboard via running:
diff --git a/dags/README.md b/dags/README.md
new file mode 100644
index 0000000..94b3ad7
--- /dev/null
+++ b/dags/README.md
@@ -0,0 +1,26 @@
+# Airflow DAGs
+
+Place your DAGs in this folder if you choose to deploy your DAGs inside the Docker image.
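+
+For example, once your DAG files are here, rebuilding and pushing the image is a matter of (the
+resulting image name depends on your `IMAGE` setting):
+
+    make build      # bakes the content of this folder into the image
+    make publish    # pushes it to your Docker registry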
+
+This enforces consistency between all the containers, and has the following advantages:
+- forces a restart of the scheduler on DAG update, avoiding inconsistency issues in case of a DAG
+  update while it is running
+- allows installation of dependencies
+
+It is also possible to keep your DAGs in an external git project, combine it with a DAG
+requirements declaration, and call kube-airflow's Makefile accordingly. For example, you can:
+
+- gate every DAG change behind a merge-request mechanism: unit tests, pip install, ...
+- make a new docker image using
+
+      DAG_PATH=$PWD
+      cd /path/to/kube-airflow
+      make EMBEDDED_DAGS_LOCATION=$DAG_PATH REQUIREMENTS_TXT_LOCATION=$DAG_PATH/requirements.txt
+
+- trigger the deployment of this new image on your Kubernetes infrastructure
diff --git a/requirements/airflow.txt b/requirements/airflow.txt
new file mode 100644
index 0000000..e3721db
--- /dev/null
+++ b/requirements/airflow.txt
@@ -0,0 +1,12 @@
+# This file contains the dependencies needed to install airflow.
+# To declare new dependencies for your DAGs, use `requirements/dags.txt`.
+
+pytz==2015.7
+cryptography
+requests
+pyOpenSSL
+ndg-httpsclient
+pyasn1
+psycopg2
+airflow[celery,postgresql,hive]
+click
diff --git a/requirements/dags.txt b/requirements/dags.txt
new file mode 100644
index 0000000..757dd54
--- /dev/null
+++ b/requirements/dags.txt
@@ -0,0 +1,15 @@
+# Place in this file the Python requirements your DAGs may need. They will be automatically
+# installed inside the container during the container build, regardless of whether you use
+# embedded DAGs or Git-sync.
+
+pytz==2015.7
+cryptography
+requests
+pyOpenSSL
+ndg-httpsclient
+pyasn1
+psycopg2
+airflow[celery,postgresql,hive]
+click
+
+# add new dependencies below:
diff --git a/script/entrypoint.sh b/script/entrypoint.sh
index ff29d4c..6910091 100644
--- a/script/entrypoint.sh
+++ b/script/entrypoint.sh
@@ -9,14 +9,17 @@ RABBITMQ_HOST="${RABBITMQ_HOST:-rabbitmq}"
 RABBITMQ_CREDS="${RABBITMQ_CREDS:-airflow:airflow}"
 RABBITMQ_MANAGEMENT_PORT=15672
 FLOWER_URL_PREFIX="${FLOWER_URL_PREFIX:-/}"
-LOAD_DAGS_EXAMPLES="${LOAD_DAGS_EXAMPLES:false}"
+LOAD_DAGS_EXAMPLES="${LOAD_DAGS_EXAMPLES:-true}"
+GIT_SYNC_REPO="${GIT_SYNC_REPO:-}"
 
 if [ -z $FERNET_KEY ]; then
-    FERNET_KEY=$(python3 -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY))"
+    FERNET_KEY=$(python3 -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)")
 fi
 
 echo "Postgres host: $POSTGRES_HOST"
 echo "RabbitMQ host: $RABBITMQ_HOST"
+echo "Load DAG examples: $LOAD_DAGS_EXAMPLES"
+echo "Git sync repository: $GIT_SYNC_REPO"
 echo
 
 # Generate Fernet key
@@ -42,7 +45,7 @@ if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] || [
 	done
 fi
 
-# wait for DB
+# wait for postgres
 if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] ; then
 	i=0
 	while ! nc $POSTGRES_HOST $POSTGRES_PORT >/dev/null 2>&1 < /dev/null; do
@@ -62,6 +65,9 @@ if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] ; the
 fi
 
 if [ ! -z $GIT_SYNC_REPO ]; then
+    mkdir -p $AIRFLOW_HOME/dags
+    # remove possibly embedded DAGs to avoid conflicts with the synchronized ones
+    rm -rf $AIRFLOW_HOME/dags/*
     echo "Executing background task git-sync on repo $GIT_SYNC_REPO"
     $AIRFLOW_HOME/git-sync --dest $AIRFLOW_HOME/dags &
 fi