From abab94c9676bdf2968c7126806b707d4d23b5dc0 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Tue, 21 Oct 2014 12:46:55 -0700
Subject: [PATCH 01/11] Add support for Hadoop YARN 2.4

---
 ephemeral-hdfs/init.sh                        | 12 ++++++
 ephemeral-hdfs/setup-slave.sh                 |  4 ++
 ephemeral-hdfs/setup.sh                       | 21 ++++++++--
 mapreduce/init.sh                             |  3 ++
 spark/init.sh                                 |  4 +-
 .../root/ephemeral-hdfs/conf/core-site.xml    |  5 +++
 .../root/ephemeral-hdfs/conf/yarn-site.xml    | 40 +++++++++++++++++++
 templates/root/spark/conf/spark-env.sh        |  3 ++
 8 files changed, 88 insertions(+), 4 deletions(-)
 create mode 100644 templates/root/ephemeral-hdfs/conf/yarn-site.xml

diff --git a/ephemeral-hdfs/init.sh b/ephemeral-hdfs/init.sh
index e618aa5..a57d7f3 100755
--- a/ephemeral-hdfs/init.sh
+++ b/ephemeral-hdfs/init.sh
@@ -23,6 +23,18 @@ case "$HADOOP_MAJOR_VERSION" in
     rm hadoop-*.tar.gz
     mv hadoop-2.0.0-cdh4.2.0/ ephemeral-hdfs/
 
+    # Have single conf dir
+    rm -rf /root/ephemeral-hdfs/etc/hadoop/
+    ln -s /root/ephemeral-hdfs/conf /root/ephemeral-hdfs/etc/hadoop
+    ;;
+  yarn)
+    # TODO: Replace this with s3 bucket
+    wget http://www.us.apache.org/dist/hadoop/common/hadoop-2.4.0/hadoop-2.4.0.tar.gz
+    echo "Unpacking Hadoop"
+    tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log
+    rm hadoop-*.tar.gz
+    mv hadoop-2.4.0/ ephemeral-hdfs/
+
     # Have single conf dir
     rm -rf /root/ephemeral-hdfs/etc/hadoop/
     ln -s /root/ephemeral-hdfs/conf /root/ephemeral-hdfs/etc/hadoop
diff --git a/ephemeral-hdfs/setup-slave.sh b/ephemeral-hdfs/setup-slave.sh
index 23871d2..a85c4df 100755
--- a/ephemeral-hdfs/setup-slave.sh
+++ b/ephemeral-hdfs/setup-slave.sh
@@ -4,6 +4,10 @@
 mkdir -p /mnt/ephemeral-hdfs/logs
 mkdir -p /mnt/hadoop-logs
 
+# Setup yarn logs, local dirs
+mkdir -p /mnt/yarn-local
+mkdir -p /mnt/yarn-logs
+
 # Create Hadoop and HDFS directories in a given parent directory
 # (for example /mnt, /mnt2, and so on)
 function create_hadoop_dirs {
diff --git a/ephemeral-hdfs/setup.sh b/ephemeral-hdfs/setup.sh
index d08b754..2dbc1a5 100755
--- a/ephemeral-hdfs/setup.sh
+++ b/ephemeral-hdfs/setup.sh
@@ -27,8 +27,23 @@ else
 fi
 
 echo "Starting ephemeral HDFS..."
-# This is different depending on version. Simple hack: just try both.
-$EPHEMERAL_HDFS/sbin/start-dfs.sh
-$EPHEMERAL_HDFS/bin/start-dfs.sh
+
+# Start scripts live in bin/ for Hadoop 1 and in sbin/ for Hadoop 2 and YARN.
+case "$HADOOP_MAJOR_VERSION" in
+  1)
+    "$EPHEMERAL_HDFS"/bin/start-dfs.sh
+    ;;
+  2)
+    "$EPHEMERAL_HDFS"/sbin/start-dfs.sh
+    ;;
+  yarn)
+    "$EPHEMERAL_HDFS"/sbin/start-dfs.sh
+    echo "Starting YARN"
+    "$EPHEMERAL_HDFS"/sbin/start-yarn.sh
+    ;;
+  *)
+    echo "ERROR: Unknown Hadoop version"
+    return 1  # "return -1" is non-POSIX and rejected as an invalid option by older bash
+esac
 
 popd
diff --git a/mapreduce/init.sh b/mapreduce/init.sh
index 8f5ce3f..297d6bb 100755
--- a/mapreduce/init.sh
+++ b/mapreduce/init.sh
@@ -11,6 +11,9 @@ case "$HADOOP_MAJOR_VERSION" in
     rm mr1-*.tar.gz
     mv hadoop-2.0.0-mr1-cdh4.2.0/ mapreduce/
     ;;
+  2.4-yarn)
+    echo "Nothing to initialize for MapReduce in Hadoop 2 YARN"
+    ;;
 
   *)
      echo "ERROR: Unknown Hadoop version"
diff --git a/spark/init.sh b/spark/init.sh
index 4f1bb38..eca76a7 100755
--- a/spark/init.sh
+++ b/spark/init.sh
@@ -91,8 +91,10 @@ else
     1.1.0)
       if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
         wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-hadoop1.tgz
-      else
+      else if [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
         wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-cdh4.tgz
+      else
+        wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-hadoop2.4.tgz
       fi
       ;;
     *)
diff --git a/templates/root/ephemeral-hdfs/conf/core-site.xml b/templates/root/ephemeral-hdfs/conf/core-site.xml
index 190e594..2aad2aa 100644
--- a/templates/root/ephemeral-hdfs/conf/core-site.xml
+++ b/templates/root/ephemeral-hdfs/conf/core-site.xml
@@ -15,6 +15,11 @@
     <value>hdfs://{{active_master}}:9000</value>
   </property>
 
+  <property>
+    <name>fs.defaultFS</name>
+    <value>hdfs://{{active_master}}:9000</value>
+  </property>
+
   <property>
     <name>io.file.buffer.size</name>
     <value>65536</value>
diff --git a/templates/root/ephemeral-hdfs/conf/yarn-site.xml b/templates/root/ephemeral-hdfs/conf/yarn-site.xml
new file mode 100644
index 0000000..5abaec6
--- /dev/null
+++ b/templates/root/ephemeral-hdfs/conf/yarn-site.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+<configuration>
+
+<!-- Site specific YARN configuration properties -->
+
+  <property>
+    <name>yarn.resourcemanager.hostname</name>
+    <value>{{active_master}}</value>
+  </property>
+
+  <!-- TODO: Should we use multiple local-dirs -->
+  <property>
+    <name>yarn.nodemanager.local-dirs</name>
+    <value>/mnt/yarn-local</value>
+  </property>
+
+  <property>
+    <name>yarn.nodemanager.log-dirs</name>
+    <value>/mnt/yarn-logs</value>
+  </property>
+
+  <property>
+    <name>yarn.log-aggregation-enable</name>
+    <value>true</value>
+  </property>
+
+</configuration>
diff --git a/templates/root/spark/conf/spark-env.sh b/templates/root/spark/conf/spark-env.sh
index fd57334..27fd003 100755
--- a/templates/root/spark/conf/spark-env.sh
+++ b/templates/root/spark/conf/spark-env.sh
@@ -17,5 +17,8 @@ export SPARK_SUBMIT_CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:/root/ep
 # Bind Spark's web UIs to this machine's public EC2 hostname:
 export SPARK_PUBLIC_DNS=`wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname`
 
+# Used for YARN mode
+export YARN_CONF_DIR="/root/ephemeral-hdfs/conf"
+
 # Set a high ulimit for large shuffles
 ulimit -n 1000000

From 99805d95d55286e96426e358f141b9ddd127c8bd Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Tue, 21 Oct 2014 12:47:26 -0700
Subject: [PATCH 02/11] Add yarn-env.sh

---
 .../root/ephemeral-hdfs/conf/yarn-env.sh      | 126 ++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 templates/root/ephemeral-hdfs/conf/yarn-env.sh

diff --git a/templates/root/ephemeral-hdfs/conf/yarn-env.sh b/templates/root/ephemeral-hdfs/conf/yarn-env.sh
new file mode 100644
index 0000000..77e6219
--- /dev/null
+++ b/templates/root/ephemeral-hdfs/conf/yarn-env.sh
@@ -0,0 +1,126 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# User for YARN daemons
+export HADOOP_YARN_USER=${HADOOP_YARN_USER:-yarn}
+
+# Hadoop/YARN conf dir is fixed for this cluster (the upstream default is kept commented out below)
+#export YARN_CONF_DIR="${YARN_CONF_DIR:-$HADOOP_YARN_HOME/conf}"
+export YARN_CONF_DIR="/root/ephemeral-hdfs/conf"
+
+# some Java parameters
+# export JAVA_HOME=/home/y/libexec/jdk1.6.0/
+#if [ "$JAVA_HOME" != "" ]; then
+#  #echo "run java in $JAVA_HOME"
+#  JAVA_HOME=$JAVA_HOME
+#fi
+#  
+#if [ "$JAVA_HOME" = "" ]; then
+#  echo "Error: JAVA_HOME is not set."
+#  exit 1
+#fi
+
+export JAVA_HOME={{java_home}}
+
+JAVA=$JAVA_HOME/bin/java
+JAVA_HEAP_MAX=-Xmx1000m 
+
+# For setting YARN specific HEAP sizes please use this
+# Parameter and set appropriately
+export YARN_HEAPSIZE=1000
+
+# check envvars which might override default args
+if [ "$YARN_HEAPSIZE" != "" ]; then
+  JAVA_HEAP_MAX="-Xmx""$YARN_HEAPSIZE""m"
+fi
+
+# Resource Manager specific parameters
+
+# Specify the max Heapsize for the ResourceManager using a numerical value
+# in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set
+# the value to 1000.
+# This value will be overridden by an Xmx setting specified in either YARN_OPTS
+# and/or YARN_RESOURCEMANAGER_OPTS.
+# If not specified, the default value will be picked from either YARN_HEAPMAX
+# or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two.
+#export YARN_RESOURCEMANAGER_HEAPSIZE=1000
+
+# Specify the max Heapsize for the timeline server using a numerical value
+# in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set
+# the value to 1000.
+# This value will be overridden by an Xmx setting specified in either YARN_OPTS
+# and/or YARN_TIMELINESERVER_OPTS.
+# If not specified, the default value will be picked from either YARN_HEAPMAX
+# or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two.
+#export YARN_TIMELINESERVER_HEAPSIZE=1000
+
+# Specify the JVM options to be used when starting the ResourceManager.
+# These options will be appended to the options specified as YARN_OPTS
+# and therefore may override any similar flags set in YARN_OPTS
+#export YARN_RESOURCEMANAGER_OPTS=
+
+# Node Manager specific parameters
+
+# Specify the max Heapsize for the NodeManager using a numerical value
+# in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set
+# the value to 1000.
+# This value will be overridden by an Xmx setting specified in either YARN_OPTS
+# and/or YARN_NODEMANAGER_OPTS.
+# If not specified, the default value will be picked from either YARN_HEAPMAX
+# or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two.
+#export YARN_NODEMANAGER_HEAPSIZE=1000
+
+# Specify the JVM options to be used when starting the NodeManager.
+# These options will be appended to the options specified as YARN_OPTS
+# and therefore may override any similar flags set in YARN_OPTS
+#export YARN_NODEMANAGER_OPTS=
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+
+# default log directory & file
+#if [ "$YARN_LOG_DIR" = "" ]; then
+#  YARN_LOG_DIR="$HADOOP_YARN_HOME/logs"
+#fi
+export YARN_LOG_DIR=/mnt/ephemeral-hdfs/logs
+
+if [ "$YARN_LOGFILE" = "" ]; then
+  YARN_LOGFILE='yarn.log'
+fi
+
+# default policy file for service-level authorization
+if [ "$YARN_POLICYFILE" = "" ]; then
+  YARN_POLICYFILE="hadoop-policy.xml"
+fi
+
+# restore ordinary behaviour
+unset IFS
+
+
+YARN_OPTS="$YARN_OPTS -Dhadoop.log.dir=$YARN_LOG_DIR"
+YARN_OPTS="$YARN_OPTS -Dyarn.log.dir=$YARN_LOG_DIR"
+YARN_OPTS="$YARN_OPTS -Dhadoop.log.file=$YARN_LOGFILE"
+YARN_OPTS="$YARN_OPTS -Dyarn.log.file=$YARN_LOGFILE"
+YARN_OPTS="$YARN_OPTS -Dyarn.home.dir=$YARN_COMMON_HOME"
+YARN_OPTS="$YARN_OPTS -Dyarn.id.str=$YARN_IDENT_STRING"
+YARN_OPTS="$YARN_OPTS -Dhadoop.root.logger=${YARN_ROOT_LOGGER:-INFO,console}"
+YARN_OPTS="$YARN_OPTS -Dyarn.root.logger=${YARN_ROOT_LOGGER:-INFO,console}"
+if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+  YARN_OPTS="$YARN_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
+fi  
+YARN_OPTS="$YARN_OPTS -Dyarn.policy.file=$YARN_POLICYFILE"
+
+

From caa969079254830824c77fc739c3771997143ed1 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Tue, 21 Oct 2014 13:00:57 -0700
Subject: [PATCH 03/11] Fix Hadoop 2.4 url

---
 ephemeral-hdfs/init.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ephemeral-hdfs/init.sh b/ephemeral-hdfs/init.sh
index a57d7f3..f229b05 100755
--- a/ephemeral-hdfs/init.sh
+++ b/ephemeral-hdfs/init.sh
@@ -29,7 +29,7 @@ case "$HADOOP_MAJOR_VERSION" in
     ;;
   yarn)
     # TODO: Replace this with s3 bucket
-    wget http://www.us.apache.org/dist/hadoop/common/hadoop-2.4.0/hadoop-2.4.0.tar.gz
+    wget http://mirror.cogentco.com/pub/apache/hadoop/common/hadoop-2.4.0/hadoop-2.4.0.tar.gz
     echo "Unpacking Hadoop"
     tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log
     rm hadoop-*.tar.gz

From 1a9b25c51a97ee82cdb49bcfea2104165f632941 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Tue, 21 Oct 2014 13:02:42 -0700
Subject: [PATCH 04/11] Fix hadoop version

---
 mapreduce/init.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mapreduce/init.sh b/mapreduce/init.sh
index 297d6bb..b6f39a1 100755
--- a/mapreduce/init.sh
+++ b/mapreduce/init.sh
@@ -11,7 +11,7 @@ case "$HADOOP_MAJOR_VERSION" in
     rm mr1-*.tar.gz
     mv hadoop-2.0.0-mr1-cdh4.2.0/ mapreduce/
     ;;
-  2.4-yarn)
+  yarn)
     echo "Nothing to initialize for MapReduce in Hadoop 2 YARN"
     ;;
 

From 67669c050444c9ed7b02b3137df088c170be07a8 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Tue, 21 Oct 2014 13:08:41 -0700
Subject: [PATCH 05/11] Use elif

---
 spark/init.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spark/init.sh b/spark/init.sh
index eca76a7..2a6a5c3 100755
--- a/spark/init.sh
+++ b/spark/init.sh
@@ -91,7 +91,7 @@ else
     1.1.0)
       if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
         wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-hadoop1.tgz
-      else if [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
+      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
         wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-cdh4.tgz
       else
         wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-hadoop2.4.tgz

From c6b81d38626918bf2f13af3e27155b403c6e3858 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Tue, 21 Oct 2014 13:18:30 -0700
Subject: [PATCH 06/11] Workaround lack of hadoop-native security libraries

---
 templates/root/ephemeral-hdfs/conf/core-site.xml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/templates/root/ephemeral-hdfs/conf/core-site.xml b/templates/root/ephemeral-hdfs/conf/core-site.xml
index 2aad2aa..65f030d 100644
--- a/templates/root/ephemeral-hdfs/conf/core-site.xml
+++ b/templates/root/ephemeral-hdfs/conf/core-site.xml
@@ -60,4 +60,9 @@
     <value>{{aws_secret_access_key}}</value>
   </property>
 
+  <property>
+    <name>hadoop.security.group.mapping</name>
+    <value>org.apache.hadoop.security.ShellBasedUnixGroupsMapping</value>
+  </property>
+
 </configuration>

From ee86a8919da2646014d18d91646b2605a355eb2e Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Tue, 21 Oct 2014 13:59:46 -0700
Subject: [PATCH 07/11] Add memory limits and capacity scheduler conf

---
 deploy_templates.py                           |   1 +
 .../conf/capacity-scheduler.xml               | 111 ++++++++++++++++++
 .../root/ephemeral-hdfs/conf/yarn-site.xml    |  10 ++
 3 files changed, 122 insertions(+)
 create mode 100644 templates/root/ephemeral-hdfs/conf/capacity-scheduler.xml

diff --git a/deploy_templates.py b/deploy_templates.py
index 2de3264..2801fdd 100755
--- a/deploy_templates.py
+++ b/deploy_templates.py
@@ -69,6 +69,7 @@
   "hadoop_major_version": os.getenv("HADOOP_MAJOR_VERSION"),
   "java_home": os.getenv("JAVA_HOME"),
   "default_tachyon_mem": "%dMB" % tachyon_mb,
+  "system_ram_mb": "%d" % system_ram_mb,
   "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
   "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
 }
diff --git a/templates/root/ephemeral-hdfs/conf/capacity-scheduler.xml b/templates/root/ephemeral-hdfs/conf/capacity-scheduler.xml
new file mode 100644
index 0000000..2bed464
--- /dev/null
+++ b/templates/root/ephemeral-hdfs/conf/capacity-scheduler.xml
@@ -0,0 +1,111 @@
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+<configuration>
+
+  <property>
+    <name>yarn.scheduler.capacity.maximum-applications</name>
+    <value>10000</value>
+    <description>
+      Maximum number of applications that can be pending and running.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
+    <value>0.1</value>
+    <description>
+      Maximum percent of resources in the cluster which can be used to run 
+      application masters i.e. controls number of concurrent running
+      applications.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.resource-calculator</name>
+    <value>org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator</value>
+    <description>
+      The ResourceCalculator implementation to be used to compare 
+      Resources in the scheduler.
+      The default i.e. DefaultResourceCalculator only uses Memory while
+      DominantResourceCalculator uses dominant-resource to compare 
+      multi-dimensional resources such as Memory, CPU etc.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.queues</name>
+    <value>default</value>
+    <description>
+      The queues at this level (root is the root queue).
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.default.capacity</name>
+    <value>100</value>
+    <description>Default queue target capacity.</description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.default.user-limit-factor</name>
+    <value>1</value>
+    <description>
+      Default queue user limit a percentage from 0.0 to 1.0.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
+    <value>100</value>
+    <description>
+      The maximum capacity of the default queue. 
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.default.state</name>
+    <value>RUNNING</value>
+    <description>
+      The state of the default queue. State can be one of RUNNING or STOPPED.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.default.acl_submit_applications</name>
+    <value>*</value>
+    <description>
+      The ACL of who can submit jobs to the default queue.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.default.acl_administer_queue</name>
+    <value>*</value>
+    <description>
+      The ACL of who can administer jobs on the default queue.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.node-locality-delay</name>
+    <value>40</value>
+    <description>
+      Number of missed scheduling opportunities after which the CapacityScheduler 
+      attempts to schedule rack-local containers. 
+      Typically this should be set to the number of nodes in the cluster. By default it is set to
+      approximately the number of nodes in one rack, which is 40.
+    </description>
+  </property>
+
+</configuration>
diff --git a/templates/root/ephemeral-hdfs/conf/yarn-site.xml b/templates/root/ephemeral-hdfs/conf/yarn-site.xml
index 5abaec6..fe116de 100644
--- a/templates/root/ephemeral-hdfs/conf/yarn-site.xml
+++ b/templates/root/ephemeral-hdfs/conf/yarn-site.xml
@@ -37,4 +37,14 @@
     <value>true</value>
   </property>
 
+  <property>
+    <name>yarn.scheduler.maximum-allocation-mb</name>
+    <value>{{system_ram_mb}}</value>
+  </property>
+
+  <property>
+    <name>yarn.nodemanager.resource.memory-mb</name>
+    <value>{{system_ram_mb}}</value>
+  </property>
+
 </configuration>

From c5398b40b5734494b80b3596b8306630d8fffd12 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Tue, 21 Oct 2014 14:05:40 -0700
Subject: [PATCH 08/11] Use S3 URL

---
 ephemeral-hdfs/init.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ephemeral-hdfs/init.sh b/ephemeral-hdfs/init.sh
index f229b05..fb5496c 100755
--- a/ephemeral-hdfs/init.sh
+++ b/ephemeral-hdfs/init.sh
@@ -29,7 +29,7 @@ case "$HADOOP_MAJOR_VERSION" in
     ;;
   yarn)
     # TODO: Replace this with s3 bucket
-    wget http://mirror.cogentco.com/pub/apache/hadoop/common/hadoop-2.4.0/hadoop-2.4.0.tar.gz
+    wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.4.0.tar.gz
     echo "Unpacking Hadoop"
     tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log
     rm hadoop-*.tar.gz

From 6a5d932734d94588cf6f20c4744313a8876faaa1 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Tue, 21 Oct 2014 14:38:15 -0700
Subject: [PATCH 09/11] Changes to get mapred to work with YARN

---
 templates/root/ephemeral-hdfs/conf/mapred-site.xml |  5 +++++
 templates/root/ephemeral-hdfs/conf/yarn-site.xml   | 10 ++++++++++
 2 files changed, 15 insertions(+)

diff --git a/templates/root/ephemeral-hdfs/conf/mapred-site.xml b/templates/root/ephemeral-hdfs/conf/mapred-site.xml
index b1637dc..3efb463 100644
--- a/templates/root/ephemeral-hdfs/conf/mapred-site.xml
+++ b/templates/root/ephemeral-hdfs/conf/mapred-site.xml
@@ -5,6 +5,11 @@
 
 <configuration>
 
+  <property>
+    <name>mapreduce.framework.name</name>
+    <value>yarn</value>
+  </property>
+
   <property>
     <name>mapred.job.tracker</name>
     <value>{{active_master}}:9001</value>
diff --git a/templates/root/ephemeral-hdfs/conf/yarn-site.xml b/templates/root/ephemeral-hdfs/conf/yarn-site.xml
index fe116de..2d01ebb 100644
--- a/templates/root/ephemeral-hdfs/conf/yarn-site.xml
+++ b/templates/root/ephemeral-hdfs/conf/yarn-site.xml
@@ -47,4 +47,14 @@
     <value>{{system_ram_mb}}</value>
   </property>
 
+  <property>
+    <name>yarn.nodemanager.aux-services</name>
+    <value>mapreduce_shuffle</value>
+  </property>
+
+  <property>
+    <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
+    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
+  </property>
+
 </configuration>

From 61fa9e64757c7326b0f4914494cfcdfd36a44091 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Tue, 21 Oct 2014 14:39:13 -0700
Subject: [PATCH 10/11] Remove obsolete TODO

---
 ephemeral-hdfs/init.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ephemeral-hdfs/init.sh b/ephemeral-hdfs/init.sh
index fb5496c..99fdffa 100755
--- a/ephemeral-hdfs/init.sh
+++ b/ephemeral-hdfs/init.sh
@@ -28,7 +28,6 @@ case "$HADOOP_MAJOR_VERSION" in
     ln -s /root/ephemeral-hdfs/conf /root/ephemeral-hdfs/etc/hadoop
     ;;
   yarn)
-    # TODO: Replace this with s3 bucket
     wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.4.0.tar.gz
     echo "Unpacking Hadoop"
     tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log

From d78b7202cc708951e7e6a225a90dda2dec9a0f7e Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Tue, 21 Oct 2014 14:47:05 -0700
Subject: [PATCH 11/11] Add persistent HDFS based on YARN

---
 persistent-hdfs/init.sh                           | 11 +++++++++++
 templates/root/persistent-hdfs/conf/core-site.xml |  5 +++++
 2 files changed, 16 insertions(+)

diff --git a/persistent-hdfs/init.sh b/persistent-hdfs/init.sh
index 636a043..690dd93 100755
--- a/persistent-hdfs/init.sh
+++ b/persistent-hdfs/init.sh
@@ -22,6 +22,17 @@ case "$HADOOP_MAJOR_VERSION" in
     rm hadoop-*.tar.gz
     mv hadoop-2.0.0-cdh4.2.0/ persistent-hdfs/
 
+    # Have single conf dir
+    rm -rf /root/persistent-hdfs/etc/hadoop/
+    ln -s /root/persistent-hdfs/conf /root/persistent-hdfs/etc/hadoop
+    ;;
+  yarn)
+    wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.4.0.tar.gz
+    echo "Unpacking Hadoop"
+    tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log
+    rm hadoop-*.tar.gz
+    mv hadoop-2.4.0/ persistent-hdfs/
+
     # Have single conf dir
     rm -rf /root/persistent-hdfs/etc/hadoop/
     ln -s /root/persistent-hdfs/conf /root/persistent-hdfs/etc/hadoop
diff --git a/templates/root/persistent-hdfs/conf/core-site.xml b/templates/root/persistent-hdfs/conf/core-site.xml
index dce3e91..1b9b4dc 100644
--- a/templates/root/persistent-hdfs/conf/core-site.xml
+++ b/templates/root/persistent-hdfs/conf/core-site.xml
@@ -15,6 +15,11 @@
     <value>hdfs://{{active_master}}:9010</value>
   </property>
 
+  <property>
+    <name>fs.defaultFS</name>
+    <value>hdfs://{{active_master}}:9010</value>
+  </property>
+
   <property>
     <name>io.file.buffer.size</name>
     <value>65536</value>