From abab94c9676bdf2968c7126806b707d4d23b5dc0 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Tue, 21 Oct 2014 12:46:55 -0700 Subject: [PATCH 01/11] Add support for Hadoop YARN 2.4 --- ephemeral-hdfs/init.sh | 12 ++++++ ephemeral-hdfs/setup-slave.sh | 4 ++ ephemeral-hdfs/setup.sh | 21 ++++++++-- mapreduce/init.sh | 3 ++ spark/init.sh | 4 +- .../root/ephemeral-hdfs/conf/core-site.xml | 5 +++ .../root/ephemeral-hdfs/conf/yarn-site.xml | 40 +++++++++++++++++++ templates/root/spark/conf/spark-env.sh | 3 ++ 8 files changed, 88 insertions(+), 4 deletions(-) create mode 100644 templates/root/ephemeral-hdfs/conf/yarn-site.xml diff --git a/ephemeral-hdfs/init.sh b/ephemeral-hdfs/init.sh index e618aa5..a57d7f3 100755 --- a/ephemeral-hdfs/init.sh +++ b/ephemeral-hdfs/init.sh @@ -23,6 +23,18 @@ case "$HADOOP_MAJOR_VERSION" in rm hadoop-*.tar.gz mv hadoop-2.0.0-cdh4.2.0/ ephemeral-hdfs/ + # Have single conf dir + rm -rf /root/ephemeral-hdfs/etc/hadoop/ + ln -s /root/ephemeral-hdfs/conf /root/ephemeral-hdfs/etc/hadoop + ;; + yarn) + # TODO: Replace this with s3 bucket + wget http://www.us.apache.org/dist/hadoop/common/hadoop-2.4.0/hadoop-2.4.0.tar.gz + echo "Unpacking Hadoop" + tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log + rm hadoop-*.tar.gz + mv hadoop-2.4.0/ ephemeral-hdfs/ + # Have single conf dir rm -rf /root/ephemeral-hdfs/etc/hadoop/ ln -s /root/ephemeral-hdfs/conf /root/ephemeral-hdfs/etc/hadoop diff --git a/ephemeral-hdfs/setup-slave.sh b/ephemeral-hdfs/setup-slave.sh index 23871d2..a85c4df 100755 --- a/ephemeral-hdfs/setup-slave.sh +++ b/ephemeral-hdfs/setup-slave.sh @@ -4,6 +4,10 @@ mkdir -p /mnt/ephemeral-hdfs/logs mkdir -p /mnt/hadoop-logs +# Setup yarn logs, local dirs +mkdir -p /mnt/yarn-local +mkdir -p /mnt/yarn-logs + # Create Hadoop and HDFS directories in a given parent directory # (for example /mnt, /mnt2, and so on) function create_hadoop_dirs { diff --git a/ephemeral-hdfs/setup.sh b/ephemeral-hdfs/setup.sh index d08b754..2dbc1a5 100755 --- a/ephemeral-hdfs/setup.sh +++ b/ephemeral-hdfs/setup.sh @@ -27,8 +27,23 @@ else fi echo "Starting ephemeral HDFS..." -# This is different depending on version. Simple hack: just try both. -$EPHEMERAL_HDFS/sbin/start-dfs.sh -$EPHEMERAL_HDFS/bin/start-dfs.sh + +# This is different depending on version. +case "$HADOOP_MAJOR_VERSION" in + 1) + $EPHEMERAL_HDFS/bin/start-dfs.sh + ;; + 2) + $EPHEMERAL_HDFS/sbin/start-dfs.sh + ;; + yarn) + $EPHEMERAL_HDFS/sbin/start-dfs.sh + echo "Starting YARN" + $EPHEMERAL_HDFS/sbin/start-yarn.sh + ;; + *) + echo "ERROR: Unknown Hadoop version" + return -1 +esac popd diff --git a/mapreduce/init.sh b/mapreduce/init.sh index 8f5ce3f..297d6bb 100755 --- a/mapreduce/init.sh +++ b/mapreduce/init.sh @@ -11,6 +11,9 @@ case "$HADOOP_MAJOR_VERSION" in rm mr1-*.tar.gz mv hadoop-2.0.0-mr1-cdh4.2.0/ mapreduce/ ;; + 2.4-yarn) + echo "Nothing to initialize for MapReduce in Hadoop 2 YARN" + ;; *) echo "ERROR: Unknown Hadoop version" diff --git a/spark/init.sh b/spark/init.sh index 4f1bb38..eca76a7 100755 --- a/spark/init.sh +++ b/spark/init.sh @@ -91,8 +91,10 @@ else 1.1.0) if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-hadoop1.tgz - else + else if [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-cdh4.tgz + else + wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-hadoop2.4.tgz fi ;; *) diff --git a/templates/root/ephemeral-hdfs/conf/core-site.xml b/templates/root/ephemeral-hdfs/conf/core-site.xml index 190e594..2aad2aa 100644 --- a/templates/root/ephemeral-hdfs/conf/core-site.xml +++ b/templates/root/ephemeral-hdfs/conf/core-site.xml @@ -15,6 +15,11 @@ hdfs://{{active_master}}:9000 + + fs.defaultFS + hdfs://{{active_master}}:9000 + + io.file.buffer.size 65536 diff --git a/templates/root/ephemeral-hdfs/conf/yarn-site.xml b/templates/root/ephemeral-hdfs/conf/yarn-site.xml new file mode 100644 index 0000000..5abaec6 --- /dev/null +++ b/templates/root/ephemeral-hdfs/conf/yarn-site.xml @@ -0,0 +1,40 @@ + + + + + + + + yarn.resourcemanager.hostname + {{active_master}} + + + + + yarn.nodemanager.local-dirs + /mnt/yarn-local + + + + yarn.nodemanager.log-dirs + /mnt/yarn-logs + + + + yarn.log-aggregation-enable + true + + + diff --git a/templates/root/spark/conf/spark-env.sh b/templates/root/spark/conf/spark-env.sh index fd57334..27fd003 100755 --- a/templates/root/spark/conf/spark-env.sh +++ b/templates/root/spark/conf/spark-env.sh @@ -17,5 +17,8 @@ export SPARK_SUBMIT_CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:/root/ep # Bind Spark's web UIs to this machine's public EC2 hostname: export SPARK_PUBLIC_DNS=`wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname` +# Used for YARN model +export YARN_CONF_DIR="/root/ephemeral-hdfs/conf" + # Set a high ulimit for large shuffles ulimit -n 1000000 From 99805d95d55286e96426e358f141b9ddd127c8bd Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Tue, 21 Oct 2014 12:47:26 -0700 Subject: [PATCH 02/11] Add yarn-env.sh --- .../root/ephemeral-hdfs/conf/yarn-env.sh | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 templates/root/ephemeral-hdfs/conf/yarn-env.sh diff --git a/templates/root/ephemeral-hdfs/conf/yarn-env.sh b/templates/root/ephemeral-hdfs/conf/yarn-env.sh new file mode 100644 index 0000000..77e6219 --- /dev/null +++ b/templates/root/ephemeral-hdfs/conf/yarn-env.sh @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# User for YARN daemons +export HADOOP_YARN_USER=${HADOOP_YARN_USER:-yarn} + +# resolve links - $0 may be a softlink +#export YARN_CONF_DIR="${YARN_CONF_DIR:-$HADOOP_YARN_HOME/conf}" +export YARN_CONF_DIR="/root/ephemeral-hdfs/conf" + +# some Java parameters +# export JAVA_HOME=/home/y/libexec/jdk1.6.0/ +#if [ "$JAVA_HOME" != "" ]; then +# #echo "run java in $JAVA_HOME" +# JAVA_HOME=$JAVA_HOME +#fi +# +#if [ "$JAVA_HOME" = "" ]; then +# echo "Error: JAVA_HOME is not set." +# exit 1 +#fi + +export JAVA_HOME={{java_home}} + +JAVA=$JAVA_HOME/bin/java +JAVA_HEAP_MAX=-Xmx1000m + +# For setting YARN specific HEAP sizes please use this +# Parameter and set appropriately +export YARN_HEAPSIZE=1000 + +# check envvars which might override default args +if [ "$YARN_HEAPSIZE" != "" ]; then + JAVA_HEAP_MAX="-Xmx""$YARN_HEAPSIZE""m" +fi + +# Resource Manager specific parameters + +# Specify the max Heapsize for the ResourceManager using a numerical value +# in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set +# the value to 1000. +# This value will be overridden by an Xmx setting specified in either YARN_OPTS +# and/or YARN_RESOURCEMANAGER_OPTS. +# If not specified, the default value will be picked from either YARN_HEAPMAX +# or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two. +#export YARN_RESOURCEMANAGER_HEAPSIZE=1000 + +# Specify the max Heapsize for the timeline server using a numerical value +# in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set +# the value to 1000. +# This value will be overridden by an Xmx setting specified in either YARN_OPTS +# and/or YARN_TIMELINESERVER_OPTS. +# If not specified, the default value will be picked from either YARN_HEAPMAX +# or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two. +#export YARN_TIMELINESERVER_HEAPSIZE=1000 + +# Specify the JVM options to be used when starting the ResourceManager. +# These options will be appended to the options specified as YARN_OPTS +# and therefore may override any similar flags set in YARN_OPTS +#export YARN_RESOURCEMANAGER_OPTS= + +# Node Manager specific parameters + +# Specify the max Heapsize for the NodeManager using a numerical value +# in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set +# the value to 1000. +# This value will be overridden by an Xmx setting specified in either YARN_OPTS +# and/or YARN_NODEMANAGER_OPTS. +# If not specified, the default value will be picked from either YARN_HEAPMAX +# or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two. +#export YARN_NODEMANAGER_HEAPSIZE=1000 + +# Specify the JVM options to be used when starting the NodeManager. +# These options will be appended to the options specified as YARN_OPTS +# and therefore may override any similar flags set in YARN_OPTS +#export YARN_NODEMANAGER_OPTS= + +# so that filenames w/ spaces are handled correctly in loops below +IFS= + + +# default log directory & file +#if [ "$YARN_LOG_DIR" = "" ]; then +# YARN_LOG_DIR="$HADOOP_YARN_HOME/logs" +#fi +export YARN_LOG_DIR=/mnt/ephemeral-hdfs/logs + +if [ "$YARN_LOGFILE" = "" ]; then + YARN_LOGFILE='yarn.log' +fi + +# default policy file for service-level authorization +if [ "$YARN_POLICYFILE" = "" ]; then + YARN_POLICYFILE="hadoop-policy.xml" +fi + +# restore ordinary behaviour +unset IFS + + +YARN_OPTS="$YARN_OPTS -Dhadoop.log.dir=$YARN_LOG_DIR" +YARN_OPTS="$YARN_OPTS -Dyarn.log.dir=$YARN_LOG_DIR" +YARN_OPTS="$YARN_OPTS -Dhadoop.log.file=$YARN_LOGFILE" +YARN_OPTS="$YARN_OPTS -Dyarn.log.file=$YARN_LOGFILE" +YARN_OPTS="$YARN_OPTS -Dyarn.home.dir=$YARN_COMMON_HOME" +YARN_OPTS="$YARN_OPTS -Dyarn.id.str=$YARN_IDENT_STRING" +YARN_OPTS="$YARN_OPTS -Dhadoop.root.logger=${YARN_ROOT_LOGGER:-INFO,console}" +YARN_OPTS="$YARN_OPTS -Dyarn.root.logger=${YARN_ROOT_LOGGER:-INFO,console}" +if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then + YARN_OPTS="$YARN_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH" +fi +YARN_OPTS="$YARN_OPTS -Dyarn.policy.file=$YARN_POLICYFILE" + + From caa969079254830824c77fc739c3771997143ed1 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Tue, 21 Oct 2014 13:00:57 -0700 Subject: [PATCH 03/11] Fix Hadoop 2.4 url --- ephemeral-hdfs/init.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ephemeral-hdfs/init.sh b/ephemeral-hdfs/init.sh index a57d7f3..f229b05 100755 --- a/ephemeral-hdfs/init.sh +++ b/ephemeral-hdfs/init.sh @@ -29,7 +29,7 @@ case "$HADOOP_MAJOR_VERSION" in ;; yarn) # TODO: Replace this with s3 bucket - wget http://www.us.apache.org/dist/hadoop/common/hadoop-2.4.0/hadoop-2.4.0.tar.gz + wget http://mirror.cogentco.com/pub/apache/hadoop/common/hadoop-2.4.0/hadoop-2.4.0.tar.gz echo "Unpacking Hadoop" tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log rm hadoop-*.tar.gz From 1a9b25c51a97ee82cdb49bcfea2104165f632941 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Tue, 21 Oct 2014 13:02:42 -0700 Subject: [PATCH 04/11] Fix hadoop version --- mapreduce/init.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mapreduce/init.sh b/mapreduce/init.sh index 297d6bb..b6f39a1 100755 --- a/mapreduce/init.sh +++ b/mapreduce/init.sh @@ -11,7 +11,7 @@ case "$HADOOP_MAJOR_VERSION" in rm mr1-*.tar.gz mv hadoop-2.0.0-mr1-cdh4.2.0/ mapreduce/ ;; - 2.4-yarn) + yarn) echo "Nothing to initialize for MapReduce in Hadoop 2 YARN" ;; From 67669c050444c9ed7b02b3137df088c170be07a8 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Tue, 21 Oct 2014 13:08:41 -0700 Subject: [PATCH 05/11] Use elif --- spark/init.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark/init.sh b/spark/init.sh index eca76a7..2a6a5c3 100755 --- a/spark/init.sh +++ b/spark/init.sh @@ -91,7 +91,7 @@ else 1.1.0) if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-hadoop1.tgz - else if [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then + elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-cdh4.tgz else wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-hadoop2.4.tgz From c6b81d38626918bf2f13af3e27155b403c6e3858 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Tue, 21 Oct 2014 13:18:30 -0700 Subject: [PATCH 06/11] Workaround lack of hadoop-native security libraries --- templates/root/ephemeral-hdfs/conf/core-site.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/templates/root/ephemeral-hdfs/conf/core-site.xml b/templates/root/ephemeral-hdfs/conf/core-site.xml index 2aad2aa..65f030d 100644 --- a/templates/root/ephemeral-hdfs/conf/core-site.xml +++ b/templates/root/ephemeral-hdfs/conf/core-site.xml @@ -60,4 +60,9 @@ {{aws_secret_access_key}} + + hadoop.security.group.mapping + org.apache.hadoop.security.ShellBasedUnixGroupsMapping + + From ee86a8919da2646014d18d91646b2605a355eb2e Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Tue, 21 Oct 2014 13:59:46 -0700 Subject: [PATCH 07/11] Add memory limits and capacity scheduler conf --- deploy_templates.py | 1 + .../conf/capacity-scheduler.xml | 111 ++++++++++++++++++ .../root/ephemeral-hdfs/conf/yarn-site.xml | 10 ++ 3 files changed, 122 insertions(+) create mode 100644 templates/root/ephemeral-hdfs/conf/capacity-scheduler.xml diff --git a/deploy_templates.py b/deploy_templates.py index 2de3264..2801fdd 100755 --- a/deploy_templates.py +++ b/deploy_templates.py @@ -69,6 +69,7 @@ "hadoop_major_version": os.getenv("HADOOP_MAJOR_VERSION"), "java_home": os.getenv("JAVA_HOME"), "default_tachyon_mem": "%dMB" % tachyon_mb, + "system_ram_mb": "%d" % system_ram_mb, "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), } diff --git a/templates/root/ephemeral-hdfs/conf/capacity-scheduler.xml b/templates/root/ephemeral-hdfs/conf/capacity-scheduler.xml new file mode 100644 index 0000000..2bed464 --- /dev/null +++ b/templates/root/ephemeral-hdfs/conf/capacity-scheduler.xml @@ -0,0 +1,111 @@ + + + + + yarn.scheduler.capacity.maximum-applications + 10000 + + Maximum number of applications that can be pending and running. + + + + + yarn.scheduler.capacity.maximum-am-resource-percent + 0.1 + + Maximum percent of resources in the cluster which can be used to run + application masters i.e. controls number of concurrent running + applications. + + + + + yarn.scheduler.capacity.resource-calculator + org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator + + The ResourceCalculator implementation to be used to compare + Resources in the scheduler. + The default i.e. DefaultResourceCalculator only uses Memory while + DominantResourceCalculator uses dominant-resource to compare + multi-dimensional resources such as Memory, CPU etc. + + + + + yarn.scheduler.capacity.root.queues + default + + The queues at the this level (root is the root queue). + + + + + yarn.scheduler.capacity.root.default.capacity + 100 + Default queue target capacity. + + + + yarn.scheduler.capacity.root.default.user-limit-factor + 1 + + Default queue user limit a percentage from 0.0 to 1.0. + + + + + yarn.scheduler.capacity.root.default.maximum-capacity + 100 + + The maximum capacity of the default queue. + + + + + yarn.scheduler.capacity.root.default.state + RUNNING + + The state of the default queue. State can be one of RUNNING or STOPPED. + + + + + yarn.scheduler.capacity.root.default.acl_submit_applications + * + + The ACL of who can submit jobs to the default queue. + + + + + yarn.scheduler.capacity.root.default.acl_administer_queue + * + + The ACL of who can administer jobs on the default queue. + + + + + yarn.scheduler.capacity.node-locality-delay + 40 + + Number of missed scheduling opportunities after which the CapacityScheduler + attempts to schedule rack-local containers. + Typically this should be set to number of nodes in the cluster, By default is setting + approximately number of nodes in one rack which is 40. + + + + diff --git a/templates/root/ephemeral-hdfs/conf/yarn-site.xml b/templates/root/ephemeral-hdfs/conf/yarn-site.xml index 5abaec6..fe116de 100644 --- a/templates/root/ephemeral-hdfs/conf/yarn-site.xml +++ b/templates/root/ephemeral-hdfs/conf/yarn-site.xml @@ -37,4 +37,14 @@ true + + yarn.scheduler.maximum-allocation-mb + {{system_ram_mb}} + + + + yarn.nodemanager.resource.memory-mb + {{system_ram_mb}} + + From c5398b40b5734494b80b3596b8306630d8fffd12 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Tue, 21 Oct 2014 14:05:40 -0700 Subject: [PATCH 08/11] Use S3 URL --- ephemeral-hdfs/init.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ephemeral-hdfs/init.sh b/ephemeral-hdfs/init.sh index f229b05..fb5496c 100755 --- a/ephemeral-hdfs/init.sh +++ b/ephemeral-hdfs/init.sh @@ -29,7 +29,7 @@ case "$HADOOP_MAJOR_VERSION" in ;; yarn) # TODO: Replace this with s3 bucket - wget http://mirror.cogentco.com/pub/apache/hadoop/common/hadoop-2.4.0/hadoop-2.4.0.tar.gz + wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.4.0.tar.gz echo "Unpacking Hadoop" tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log rm hadoop-*.tar.gz From 6a5d932734d94588cf6f20c4744313a8876faaa1 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Tue, 21 Oct 2014 14:38:15 -0700 Subject: [PATCH 09/11] Changes to get mapred to work with YARN --- templates/root/ephemeral-hdfs/conf/mapred-site.xml | 5 +++++ templates/root/ephemeral-hdfs/conf/yarn-site.xml | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/templates/root/ephemeral-hdfs/conf/mapred-site.xml b/templates/root/ephemeral-hdfs/conf/mapred-site.xml index b1637dc..3efb463 100644 --- a/templates/root/ephemeral-hdfs/conf/mapred-site.xml +++ b/templates/root/ephemeral-hdfs/conf/mapred-site.xml @@ -5,6 +5,11 @@ + + mapreduce.framework.name + yarn + + mapred.job.tracker {{active_master}}:9001 diff --git a/templates/root/ephemeral-hdfs/conf/yarn-site.xml b/templates/root/ephemeral-hdfs/conf/yarn-site.xml index fe116de..2d01ebb 100644 --- a/templates/root/ephemeral-hdfs/conf/yarn-site.xml +++ b/templates/root/ephemeral-hdfs/conf/yarn-site.xml @@ -47,4 +47,14 @@ {{system_ram_mb}} + + yarn.nodemanager.aux-services + mapreduce_shuffle + + + + yarn.nodemanager.aux-services.mapreduce.shuffle.class + org.apache.hadoop.mapred.ShuffleHandler + + From 61fa9e64757c7326b0f4914494cfcdfd36a44091 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Tue, 21 Oct 2014 14:39:13 -0700 Subject: [PATCH 10/11] Remove obsolete TODO --- ephemeral-hdfs/init.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ephemeral-hdfs/init.sh b/ephemeral-hdfs/init.sh index fb5496c..99fdffa 100755 --- a/ephemeral-hdfs/init.sh +++ b/ephemeral-hdfs/init.sh @@ -28,7 +28,6 @@ case "$HADOOP_MAJOR_VERSION" in ln -s /root/ephemeral-hdfs/conf /root/ephemeral-hdfs/etc/hadoop ;; yarn) - # TODO: Replace this with s3 bucket wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.4.0.tar.gz echo "Unpacking Hadoop" tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log From d78b7202cc708951e7e6a225a90dda2dec9a0f7e Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Tue, 21 Oct 2014 14:47:05 -0700 Subject: [PATCH 11/11] Add persistent HDFS based on YARN --- persistent-hdfs/init.sh | 11 +++++++++++ templates/root/persistent-hdfs/conf/core-site.xml | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/persistent-hdfs/init.sh b/persistent-hdfs/init.sh index 636a043..690dd93 100755 --- a/persistent-hdfs/init.sh +++ b/persistent-hdfs/init.sh @@ -22,6 +22,17 @@ case "$HADOOP_MAJOR_VERSION" in rm hadoop-*.tar.gz mv hadoop-2.0.0-cdh4.2.0/ persistent-hdfs/ + # Have single conf dir + rm -rf /root/persistent-hdfs/etc/hadoop/ + ln -s /root/persistent-hdfs/conf /root/persistent-hdfs/etc/hadoop + ;; + yarn) + wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.4.0.tar.gz + echo "Unpacking Hadoop" + tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log + rm hadoop-*.tar.gz + mv hadoop-2.4.0/ persistent-hdfs/ + # Have single conf dir rm -rf /root/persistent-hdfs/etc/hadoop/ ln -s /root/persistent-hdfs/conf /root/persistent-hdfs/etc/hadoop diff --git a/templates/root/persistent-hdfs/conf/core-site.xml b/templates/root/persistent-hdfs/conf/core-site.xml index dce3e91..1b9b4dc 100644 --- a/templates/root/persistent-hdfs/conf/core-site.xml +++ b/templates/root/persistent-hdfs/conf/core-site.xml @@ -15,6 +15,11 @@ hdfs://{{active_master}}:9010 + + fs.defaultFS + hdfs://{{active_master}}:9010 + + io.file.buffer.size 65536