mesos · shivaram · Oct 21, 2014 · Oct 21, 2014 · Oct 21, 2014 · Oct 21, 2014
diff --git a/deploy_templates.py b/deploy_templates.py
@@ -69,6 +69,7 @@
   "hadoop_major_version": os.getenv("HADOOP_MAJOR_VERSION"),
   "java_home": os.getenv("JAVA_HOME"),
   "default_tachyon_mem": "%dMB" % tachyon_mb,
+  "system_ram_mb": "%d" % system_ram_mb,
   "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
   "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
 }

diff --git a/ephemeral-hdfs/init.sh b/ephemeral-hdfs/init.sh
@@ -23,6 +23,17 @@ case "$HADOOP_MAJOR_VERSION" in
     rm hadoop-*.tar.gz
     mv hadoop-2.0.0-cdh4.2.0/ ephemeral-hdfs/
 
+    # Have single conf dir
+    rm -rf /root/ephemeral-hdfs/etc/hadoop/
+    ln -s /root/ephemeral-hdfs/conf /root/ephemeral-hdfs/etc/hadoop
+    ;;
+  yarn)
+    wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.4.0.tar.gz
+    echo "Unpacking Hadoop"
+    tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log
+    rm hadoop-*.tar.gz
+    mv hadoop-2.4.0/ ephemeral-hdfs/
+
     # Have single conf dir
     rm -rf /root/ephemeral-hdfs/etc/hadoop/
     ln -s /root/ephemeral-hdfs/conf /root/ephemeral-hdfs/etc/hadoop

diff --git a/ephemeral-hdfs/setup-slave.sh b/ephemeral-hdfs/setup-slave.sh
@@ -4,6 +4,10 @@
 mkdir -p /mnt/ephemeral-hdfs/logs
 mkdir -p /mnt/hadoop-logs
 
+# Setup yarn logs, local dirs
+mkdir -p /mnt/yarn-local
+mkdir -p /mnt/yarn-logs
+
 # Create Hadoop and HDFS directories in a given parent directory
 # (for example /mnt, /mnt2, and so on)
 function create_hadoop_dirs {

diff --git a/ephemeral-hdfs/setup.sh b/ephemeral-hdfs/setup.sh
@@ -27,8 +27,23 @@ else
 fi
 
 echo "Starting ephemeral HDFS..."
-# This is different depending on version. Simple hack: just try both.
-$EPHEMERAL_HDFS/sbin/start-dfs.sh
-$EPHEMERAL_HDFS/bin/start-dfs.sh
+
+# This is different depending on version.
+case "$HADOOP_MAJOR_VERSION" in
+  1)
+    $EPHEMERAL_HDFS/bin/start-dfs.sh
+    ;;
+  2)
+    $EPHEMERAL_HDFS/sbin/start-dfs.sh
+    ;;
+  yarn) 
+    $EPHEMERAL_HDFS/sbin/start-dfs.sh
+    echo "Starting YARN"
+    $EPHEMERAL_HDFS/sbin/start-yarn.sh
+    ;;
+  *)
+     echo "ERROR: Unknown Hadoop version"
+     return -1
+esac
 
 popd
diff --git a/mapreduce/init.sh b/mapreduce/init.sh
@@ -11,6 +11,9 @@ case "$HADOOP_MAJOR_VERSION" in
     rm mr1-*.tar.gz
     mv hadoop-2.0.0-mr1-cdh4.2.0/ mapreduce/
     ;;
+  yarn)
+    echo "Nothing to initialize for MapReduce in Hadoop 2 YARN"
+    ;;
 
   *)
      echo "ERROR: Unknown Hadoop version"

diff --git a/persistent-hdfs/init.sh b/persistent-hdfs/init.sh
@@ -22,6 +22,17 @@ case "$HADOOP_MAJOR_VERSION" in
     rm hadoop-*.tar.gz
     mv hadoop-2.0.0-cdh4.2.0/ persistent-hdfs/
 
+    # Have single conf dir
+    rm -rf /root/persistent-hdfs/etc/hadoop/
+    ln -s /root/persistent-hdfs/conf /root/persistent-hdfs/etc/hadoop
+    ;;
+  yarn)
+    wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.4.0.tar.gz
+    echo "Unpacking Hadoop"
+    tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log
+    rm hadoop-*.tar.gz
+    mv hadoop-2.4.0/ persistent-hdfs/
+
     # Have single conf dir
     rm -rf /root/persistent-hdfs/etc/hadoop/
     ln -s /root/persistent-hdfs/conf /root/persistent-hdfs/etc/hadoop

diff --git a/spark/init.sh b/spark/init.sh
@@ -91,8 +91,10 @@ else
     1.1.0)
       if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
         wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-hadoop1.tgz
-      else
+      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
         wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-cdh4.tgz
+      else
+        wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-hadoop2.4.tgz
       fi
       ;;
     *)

diff --git a/templates/root/ephemeral-hdfs/conf/capacity-scheduler.xml b/templates/root/ephemeral-hdfs/conf/capacity-scheduler.xml
@@ -0,0 +1,111 @@
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+<configuration>
+
+  <property>
+    <name>yarn.scheduler.capacity.maximum-applications</name>
+    <value>10000</value>
+    <description>
+      Maximum number of applications that can be pending and running.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
+    <value>0.1</value>
+    <description>
+      Maximum percent of resources in the cluster which can be used to run 
+      application masters i.e. controls number of concurrent running
+      applications.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.resource-calculator</name>
+    <value>org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator</value>
+    <description>
+      The ResourceCalculator implementation to be used to compare 
+      Resources in the scheduler.
+      The default i.e. DefaultResourceCalculator only uses Memory while
+      DominantResourceCalculator uses dominant-resource to compare 
+      multi-dimensional resources such as Memory, CPU etc.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.queues</name>
+    <value>default</value>
+    <description>
+      The queues at the this level (root is the root queue).
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.default.capacity</name>
+    <value>100</value>
+    <description>Default queue target capacity.</description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.default.user-limit-factor</name>
+    <value>1</value>
+    <description>
+      Default queue user limit a percentage from 0.0 to 1.0.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
+    <value>100</value>
+    <description>
+      The maximum capacity of the default queue. 
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.default.state</name>
+    <value>RUNNING</value>
+    <description>
+      The state of the default queue. State can be one of RUNNING or STOPPED.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.default.acl_submit_applications</name>
+    <value>*</value>
+    <description>
+      The ACL of who can submit jobs to the default queue.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.default.acl_administer_queue</name>
+    <value>*</value>
+    <description>
+      The ACL of who can administer jobs on the default queue.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.node-locality-delay</name>
+    <value>40</value>
+    <description>
+      Number of missed scheduling opportunities after which the CapacityScheduler 
+      attempts to schedule rack-local containers. 
+      Typically this should be set to number of nodes in the cluster, By default is setting 
+      approximately number of nodes in one rack which is 40.
+    </description>
+  </property>
+
+</configuration>
diff --git a/templates/root/ephemeral-hdfs/conf/core-site.xml b/templates/root/ephemeral-hdfs/conf/core-site.xml
@@ -15,6 +15,11 @@
     <value>hdfs://{{active_master}}:9000</value>
   </property>
 
+  <property>
+    <name>fs.defaultFS</name>
+    <value>hdfs://{{active_master}}:9000</value>
+  </property>
+
   <property>
     <name>io.file.buffer.size</name>
     <value>65536</value>
@@ -55,4 +60,9 @@
     <value>{{aws_secret_access_key}}</value>
   </property>
 
+  <property>
+    <name>hadoop.security.group.mapping</name>
+    <value>org.apache.hadoop.security.ShellBasedUnixGroupsMapping</value>
+  </property>
+
 </configuration>
diff --git a/templates/root/ephemeral-hdfs/conf/mapred-site.xml b/templates/root/ephemeral-hdfs/conf/mapred-site.xml
@@ -5,6 +5,11 @@
 
 <configuration>
 
+  <property>
+    <name>mapreduce.framework.name</name>
+    <value>yarn</value>
+  </property>
+
   <property>
     <name>mapred.job.tracker</name>
     <value>{{active_master}}:9001</value>

diff --git a/templates/root/ephemeral-hdfs/conf/yarn-env.sh b/templates/root/ephemeral-hdfs/conf/yarn-env.sh
@@ -0,0 +1,126 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# User for YARN daemons
+export HADOOP_YARN_USER=${HADOOP_YARN_USER:-yarn}
+
+# resolve links - $0 may be a softlink
+#export YARN_CONF_DIR="${YARN_CONF_DIR:-$HADOOP_YARN_HOME/conf}"
+export YARN_CONF_DIR="/root/ephemeral-hdfs/conf"
+
+# some Java parameters
+# export JAVA_HOME=/home/y/libexec/jdk1.6.0/
+#if [ "$JAVA_HOME" != "" ]; then
+#  #echo "run java in $JAVA_HOME"
+#  JAVA_HOME=$JAVA_HOME
+#fi
+#  
+#if [ "$JAVA_HOME" = "" ]; then
+#  echo "Error: JAVA_HOME is not set."
+#  exit 1
+#fi
+
+export JAVA_HOME={{java_home}}
+
+JAVA=$JAVA_HOME/bin/java
+JAVA_HEAP_MAX=-Xmx1000m 
+
+# For setting YARN specific HEAP sizes please use this
+# Parameter and set appropriately
+export YARN_HEAPSIZE=1000
+
+# check envvars which might override default args
+if [ "$YARN_HEAPSIZE" != "" ]; then
+  JAVA_HEAP_MAX="-Xmx""$YARN_HEAPSIZE""m"
+fi
+
+# Resource Manager specific parameters
+
+# Specify the max Heapsize for the ResourceManager using a numerical value
+# in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set
+# the value to 1000.
+# This value will be overridden by an Xmx setting specified in either YARN_OPTS
+# and/or YARN_RESOURCEMANAGER_OPTS.
+# If not specified, the default value will be picked from either YARN_HEAPMAX
+# or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two.
+#export YARN_RESOURCEMANAGER_HEAPSIZE=1000
+
+# Specify the max Heapsize for the timeline server using a numerical value
+# in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set
+# the value to 1000.
+# This value will be overridden by an Xmx setting specified in either YARN_OPTS
+# and/or YARN_TIMELINESERVER_OPTS.
+# If not specified, the default value will be picked from either YARN_HEAPMAX
+# or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two.
+#export YARN_TIMELINESERVER_HEAPSIZE=1000
+
+# Specify the JVM options to be used when starting the ResourceManager.
+# These options will be appended to the options specified as YARN_OPTS
+# and therefore may override any similar flags set in YARN_OPTS
+#export YARN_RESOURCEMANAGER_OPTS=
+
+# Node Manager specific parameters
+
+# Specify the max Heapsize for the NodeManager using a numerical value
+# in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set
+# the value to 1000.
+# This value will be overridden by an Xmx setting specified in either YARN_OPTS
+# and/or YARN_NODEMANAGER_OPTS.
+# If not specified, the default value will be picked from either YARN_HEAPMAX
+# or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two.
+#export YARN_NODEMANAGER_HEAPSIZE=1000
+
+# Specify the JVM options to be used when starting the NodeManager.
+# These options will be appended to the options specified as YARN_OPTS
+# and therefore may override any similar flags set in YARN_OPTS
+#export YARN_NODEMANAGER_OPTS=
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+
+# default log directory & file
+#if [ "$YARN_LOG_DIR" = "" ]; then
+#  YARN_LOG_DIR="$HADOOP_YARN_HOME/logs"
+#fi
+export YARN_LOG_DIR=/mnt/ephemeral-hdfs/logs
+
+if [ "$YARN_LOGFILE" = "" ]; then
+  YARN_LOGFILE='yarn.log'
+fi
+
+# default policy file for service-level authorization
+if [ "$YARN_POLICYFILE" = "" ]; then
+  YARN_POLICYFILE="hadoop-policy.xml"
+fi
+
+# restore ordinary behaviour
+unset IFS
+
+
+YARN_OPTS="$YARN_OPTS -Dhadoop.log.dir=$YARN_LOG_DIR"
+YARN_OPTS="$YARN_OPTS -Dyarn.log.dir=$YARN_LOG_DIR"
+YARN_OPTS="$YARN_OPTS -Dhadoop.log.file=$YARN_LOGFILE"
+YARN_OPTS="$YARN_OPTS -Dyarn.log.file=$YARN_LOGFILE"
+YARN_OPTS="$YARN_OPTS -Dyarn.home.dir=$YARN_COMMON_HOME"
+YARN_OPTS="$YARN_OPTS -Dyarn.id.str=$YARN_IDENT_STRING"
+YARN_OPTS="$YARN_OPTS -Dhadoop.root.logger=${YARN_ROOT_LOGGER:-INFO,console}"
+YARN_OPTS="$YARN_OPTS -Dyarn.root.logger=${YARN_ROOT_LOGGER:-INFO,console}"
+if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+  YARN_OPTS="$YARN_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
+fi  
+YARN_OPTS="$YARN_OPTS -Dyarn.policy.file=$YARN_POLICYFILE"
+
+