Skip to content
This repository has been archived by the owner on Jan 9, 2019. It is now read-only.

Commit

Permalink
Adding mapreduce example
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewmccullough committed Jan 13, 2010
1 parent 75e088a commit 4d1d85a
Show file tree
Hide file tree
Showing 15 changed files with 559 additions and 0 deletions.
1 change: 1 addition & 0 deletions mapreduce-example/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
output
98 changes: 98 additions & 0 deletions mapreduce-example/input/capacity-scheduler.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
<?xml version="1.0"?>

<!-- This is the configuration file for the resource manager in Hadoop. -->
<!-- You can configure various scheduling parameters related to queues. -->
<!-- The properties for a queue follow a naming convention,such as, -->
<!-- mapred.capacity-scheduler.queue.<queue-name>.property-name. -->

<configuration>

<property>
<name>mapred.capacity-scheduler.queue.default.capacity</name>
<value>100</value>
<description>Percentage of the number of slots in the cluster that are
to be available for jobs in this queue.
</description>
</property>

<property>
<name>mapred.capacity-scheduler.queue.default.supports-priority</name>
<value>false</value>
<description>If true, priorities of jobs will be taken into
account in scheduling decisions.
</description>
</property>

<property>
<name>mapred.capacity-scheduler.queue.default.minimum-user-limit-percent</name>
<value>100</value>
<description> Each queue enforces a limit on the percentage of resources
allocated to a user at any given time, if there is competition for them.
This user limit can vary between a minimum and maximum value. The former
depends on the number of users who have submitted jobs, and the latter is
set to this property value. For example, suppose the value of this
property is 25. If two users have submitted jobs to a queue, no single
user can use more than 50% of the queue resources. If a third user submits
a job, no single user can use more than 33% of the queue resources. With 4
or more users, no user can use more than 25% of the queue's resources. A
value of 100 implies no user limits are imposed.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.queue.default.maximum-initialized-jobs-per-user</name>
<value>2</value>
<description>The maximum number of jobs to be pre-initialized for a user
of the job queue.
</description>
</property>

<!-- The default configuration settings for the capacity task scheduler -->
<!-- The default values would be applied to all the queues which don't have -->
<!-- the appropriate property for the particular queue -->
<property>
<name>mapred.capacity-scheduler.default-supports-priority</name>
<value>false</value>
<description>If true, priorities of jobs will be taken into
account in scheduling decisions by default in a job queue.
</description>
</property>

<property>
<name>mapred.capacity-scheduler.default-minimum-user-limit-percent</name>
<value>100</value>
<description>The percentage of the resources limited to a particular user
for the job queue at any given point of time by default.
</description>
</property>

<property>
<name>mapred.capacity-scheduler.default-maximum-initialized-jobs-per-user</name>
<value>2</value>
<description>The maximum number of jobs to be pre-initialized for a user
of the job queue.
</description>
</property>


<!-- Capacity scheduler Job Initialization configuration parameters -->
<property>
<name>mapred.capacity-scheduler.init-poll-interval</name>
<value>5000</value>
<description>The amount of time in miliseconds which is used to poll
the job queues for jobs to initialize.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.init-worker-threads</name>
<value>5</value>
<description>Number of worker threads which would be used by
Initialization poller to initialize jobs in a set of queue.
If number mentioned in property is equal to number of job queues
then a single thread would initialize jobs in a queue. If lesser
then a thread would get a set of queues assigned. If the number
is greater then number of threads would be equal to number of
job queues.
</description>
</property>

</configuration>
24 changes: 24 additions & 0 deletions mapreduce-example/input/configuration.xsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?xml version="1.0"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:output method="html"/>
<xsl:template match="configuration">
<html>
<body>
<table border="1">
<tr>
<td>name</td>
<td>value</td>
<td>description</td>
</tr>
<xsl:for-each select="property">
<tr>
<td><a name="{name}"><xsl:value-of select="name"/></a></td>
<td><xsl:value-of select="value"/></td>
<td><xsl:value-of select="description"/></td>
</tr>
</xsl:for-each>
</table>
</body>
</html>
</xsl:template>
</xsl:stylesheet>
8 changes: 8 additions & 0 deletions mapreduce-example/input/core-site.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>

</configuration>
54 changes: 54 additions & 0 deletions mapreduce-example/input/hadoop-env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Set Hadoop-specific environment variables here.

# The only required environment variable is JAVA_HOME. All others are
# optional. When running a distributed configuration it is best to
# set JAVA_HOME in this file, so that it is correctly defined on
# remote nodes.

# The java implementation to use. Required.
# export JAVA_HOME=/usr/lib/j2sdk1.5-sun

# Extra Java CLASSPATH elements. Optional.
# export HADOOP_CLASSPATH=

# The maximum amount of heap to use, in MB. Default is 1000.
# export HADOOP_HEAPSIZE=2000

# Extra Java runtime options. Empty by default.
# export HADOOP_OPTS=-server

# Command specific options appended to HADOOP_OPTS when specified
export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS"
export HADOOP_SECONDARYNAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_SECONDARYNAMENODE_OPTS"
export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_DATANODE_OPTS"
export HADOOP_BALANCER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_BALANCER_OPTS"
export HADOOP_JOBTRACKER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_JOBTRACKER_OPTS"
# export HADOOP_TASKTRACKER_OPTS=
# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
# export HADOOP_CLIENT_OPTS

# Extra ssh options. Empty by default.
# export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR"

# Where log files are stored. $HADOOP_HOME/logs by default.
# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs

# File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default.
# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves

# host:path where hadoop code should be rsync'd from. Unset by default.
# export HADOOP_MASTER=master:/home/$USER/src/hadoop

# Seconds to sleep between slave commands. Unset by default. This
# can be useful in large clusters, where, e.g., slave rsyncs can
# otherwise arrive faster than the master can service them.
# export HADOOP_SLAVE_SLEEP=0.1

# The directory where pid files are stored. /tmp by default.
# export HADOOP_PID_DIR=/var/hadoop/pids

# A string representing this instance of hadoop. $USER by default.
# export HADOOP_IDENT_STRING=$USER

# The scheduling priority for daemon processes. See 'man nice'.
# export HADOOP_NICENESS=10
40 changes: 40 additions & 0 deletions mapreduce-example/input/hadoop-metrics.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Configuration of the "dfs" context for null
dfs.class=org.apache.hadoop.metrics.spi.NullContext

# Configuration of the "dfs" context for file
#dfs.class=org.apache.hadoop.metrics.file.FileContext
#dfs.period=10
#dfs.fileName=/tmp/dfsmetrics.log

# Configuration of the "dfs" context for ganglia
# dfs.class=org.apache.hadoop.metrics.ganglia.GangliaContext
# dfs.period=10
# dfs.servers=localhost:8649


# Configuration of the "mapred" context for null
mapred.class=org.apache.hadoop.metrics.spi.NullContext

# Configuration of the "mapred" context for file
#mapred.class=org.apache.hadoop.metrics.file.FileContext
#mapred.period=10
#mapred.fileName=/tmp/mrmetrics.log

# Configuration of the "mapred" context for ganglia
# mapred.class=org.apache.hadoop.metrics.ganglia.GangliaContext
# mapred.period=10
# mapred.servers=localhost:8649


# Configuration of the "jvm" context for null
jvm.class=org.apache.hadoop.metrics.spi.NullContext

# Configuration of the "jvm" context for file
#jvm.class=org.apache.hadoop.metrics.file.FileContext
#jvm.period=10
#jvm.fileName=/tmp/jvmmetrics.log

# Configuration of the "jvm" context for ganglia
# jvm.class=org.apache.hadoop.metrics.ganglia.GangliaContext
# jvm.period=10
# jvm.servers=localhost:8649
97 changes: 97 additions & 0 deletions mapreduce-example/input/hadoop-policy.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
<name>security.client.protocol.acl</name>
<value>*</value>
<description>ACL for ClientProtocol, which is used by user code
via the DistributedFileSystem.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>

<property>
<name>security.client.datanode.protocol.acl</name>
<value>*</value>
<description>ACL for ClientDatanodeProtocol, the client-to-datanode protocol
for block recovery.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>

<property>
<name>security.datanode.protocol.acl</name>
<value>*</value>
<description>ACL for DatanodeProtocol, which is used by datanodes to
communicate with the namenode.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>

<property>
<name>security.inter.datanode.protocol.acl</name>
<value>*</value>
<description>ACL for InterDatanodeProtocol, the inter-datanode protocol
for updating generation timestamp.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>

<property>
<name>security.namenode.protocol.acl</name>
<value>*</value>
<description>ACL for NamenodeProtocol, the protocol used by the secondary
namenode to communicate with the namenode.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>

<property>
<name>security.inter.tracker.protocol.acl</name>
<value>*</value>
<description>ACL for InterTrackerProtocol, used by the tasktrackers to
communicate with the jobtracker.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>

<property>
<name>security.job.submission.protocol.acl</name>
<value>*</value>
<description>ACL for JobSubmissionProtocol, used by job clients to
communciate with the jobtracker for job submission, querying job status etc.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>

<property>
<name>security.task.umbilical.protocol.acl</name>
<value>*</value>
<description>ACL for TaskUmbilicalProtocol, used by the map and reduce
tasks to communicate with the parent tasktracker.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>

<property>
<name>security.refresh.policy.protocol.acl</name>
<value>*</value>
<description>ACL for RefreshAuthorizationPolicyProtocol, used by the
dfsadmin and mradmin commands to refresh the security policy in-effect.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>

</configuration>
20 changes: 20 additions & 0 deletions mapreduce-example/input/hdfs-site.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
<!-- Matthew 2010-01-13 -->
<property>
<name>fs.default.name</name>
<value>localhost:9000</value>
</property>
<property>
<name>mapred.job.tracker</name>
<value>localhost:9001</value>
</property>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
Loading

0 comments on commit 4d1d85a

Please sign in to comment.