forked from dsaidgovsg/airflow-pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
docker-compose.emr.yml
110 lines (110 loc) · 3.95 KB
/
docker-compose.emr.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Airflow stack (Postgres metadata database, scheduler, webserver) whose
# scheduler borrows the Hadoop/Spark installation of the host it runs on.
# Every value set to "fixme" is a placeholder that must be replaced before use.
version: '2.3'
services:
  postgres:
    hostname: postgres
    image: postgres:9.6
    ports:
      # Published on the host loopback only. The scheduler runs with
      # network_mode: host and reaches the database through this port.
      - '127.0.0.1:5999:5432'
    restart: always
    volumes:
      - postgres:/var/lib/postgresql/data
    environment:
      POSTGRES_USER: fixme
      POSTGRES_PASSWORD: fixme
      POSTGRES_DB: airflow

  scheduler:
    build:
      context: .
      target: with-spark-optional-dag
      dockerfile: Dockerfile
    command: ['afp-scheduler']
    depends_on:
      - postgres
    environment:
      # NOTE(review): this key is committed in plain text and is the same value
      # the webserver service uses. Replace it with your own secret and keep
      # both services in sync.
      AIRFLOW__CORE__FERNET_KEY: ZZvP_7JEhKfQ7v5sg1DV1o4b1DcGv3Tu9vJXzEKkxbE=
      # localhost:5999 is the postgres port published on the host (see the
      # postgres service); the service name is not used here because this
      # container is on the host network.
      AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql://fixme:fixme@localhost:5999/airflow
      # FOR S3 LOGGING, COMMENT OUT THE BELOW 3 LINES TO DISABLE S3 LOGGING
      # S3_LOG_FOLDER FORMAT: s3://bucket/[directories]
      AIRFLOW__CORE__TASK_LOG_READER: s3.task
      AIRFLOW__CORE__LOGGING_CONFIG_CLASS: s3_log_config.LOGGING_CONFIG
      S3_LOG_FOLDER: s3://fixme/
      # IMPORTANT ENV VARS FOR EMR BELOW
      SPARK_HOME: /usr/lib/spark
      PYTHONPATH: /usr/lib/spark/python:/usr/lib/spark/python/lib/py4j-src.zip
      # Multi-line double-quoted scalar: YAML folds each line break into a
      # single space, so the container receives this as one line.
      SPARK_OPTS: "--conf spark.yarn.dist.files=file:/usr/lib/spark/python/lib/pyspark.zip,file:/usr/lib/spark/python/lib/py4j-src.zip
        --conf spark.executorEnv.PYTHONPATH=pyspark.zip:py4j-src.zip
        "
      # A null value gives the variable no value in this file; Compose resolves
      # it from the environment of the machine Compose is run on.
      HADOOP_COMMON_LIB_NATIVE_DIR: null
      HADOOP_HOME: null
      HADOOP_CONF_DIR: null
      HADOOP_COMMON_HOME: null
      HADOOP_MAPRED_HOME: null
      HADOOP_YARN_HOME: null
      HADOOP_HTTPFS_HOME: null
      HADOOP_HDFS_HOME: null
      HADOOP_KMS_HOME: null
      HADOOP_OPTS: null
      HDFS_CONF_DIR: null
      HIVE_CONF_DIR: null
      YARN_CONF_DIR: null
      PYSPARK_PYTHON: null
      PYSPARK_SUBMIT_ARGS: null
      SPARK_MASTER_OPTS: null
      SPARK_MASTER_PORT: null
      SPARK_MASTER_WEBUI_PORT: null
      SPARK_PACKAGES: null
      SPARK_WORKER_OPTS: null
      SPARK_WORKER_PORT: null
      SPARK_WORKER_WEBUI_PORT: null
      # IMPORTANT ENV VARS FOR EMR ABOVE, OVERRIDE THEM IF NECESSARY OR ADD ADDITIONAL ENV VARS BELOW
    hostname: scheduler
    network_mode: host
    restart: always
    volumes:
      - airflow_logs:/airflow/logs:rw
      # IMPORTANT BIND MOUNTS FOR EMR BELOW
      - /etc/alternatives/hadoop-conf:/etc/alternatives/hadoop-conf:ro
      - /etc/alternatives/hive-conf:/etc/alternatives/hive-conf:ro
      - /etc/alternatives/spark-conf:/etc/alternatives/spark-conf:ro
      - /etc/spark/conf:/etc/spark/conf:ro
      - /etc/hadoop/conf:/etc/hadoop/conf:ro
      - /etc/hive/conf:/etc/hive/conf:ro
      - /usr/bin/hdfs:/usr/bin/hdfs:ro
      - /usr/bin/hadoop:/usr/bin/hadoop:ro
      - /usr/bin/spark-shell:/usr/bin/spark-shell:ro
      - /usr/bin/pyspark:/usr/bin/pyspark:ro
      - /usr/lib/bigtop-utils:/usr/lib/bigtop-utils:ro
      - /usr/lib/hadoop:/usr/lib/hadoop:ro
      - /usr/lib/hadoop-hdfs:/usr/lib/hadoop-hdfs:ro
      - /usr/lib/hadoop-kms:/usr/lib/hadoop-kms:ro
      - /usr/lib/hadoop-httpfs:/usr/lib/hadoop-httpfs:ro
      - /usr/lib/hadoop-lzo:/usr/lib/hadoop-lzo:ro
      - /usr/lib/hadoop-yarn:/usr/lib/hadoop-yarn:ro
      - /usr/lib/hadoop-mapreduce:/usr/lib/hadoop-mapreduce:ro
      - /usr/lib/spark:/usr/lib/spark:ro
      - /usr/share/aws:/usr/share/aws:ro
      - /mnt/s3:/mnt/s3:rw
      - /mnt/var/lib/hadoop/tmp:/mnt/var/lib/hadoop/tmp:rw
      # IMPORTANT BIND MOUNTS FOR EMR ABOVE, OVERRIDE THEM IF NECESSARY OR ADD ADDITIONAL VOLUMES BELOW

  webserver:
    build:
      context: .
      target: with-spark-optional-dag
      dockerfile: Dockerfile
    command: ['afp-webserver']
    depends_on:
      - scheduler
    environment:
      AIRFLOW_EMAIL: fixme@data.gov.sg
      AIRFLOW_PASSWORD: fixme
      AIRFLOW_USER: fixme
      # Same key as the scheduler service; change both together.
      AIRFLOW__CORE__FERNET_KEY: ZZvP_7JEhKfQ7v5sg1DV1o4b1DcGv3Tu9vJXzEKkxbE=
      # Unlike the scheduler, this container is not on the host network, so it
      # addresses the database by service name on the container port.
      AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql://fixme:fixme@postgres:5432/airflow
    hostname: webserver
    ports:
      - '8888:8080'
    restart: always
    # Reuses every volume mounted by the scheduler (logs and the host mounts).
    volumes_from:
      - scheduler

volumes:
  airflow_logs: {}
  # Named volume mounted by the postgres service; it must be declared here or
  # Compose rejects the file.
  postgres: {}