forked from infochimps-labs/ironfan-homebase
/
hadoop_demo.rb
113 lines (102 loc) · 3.74 KB
/
hadoop_demo.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#
#
# * persistent HDFS --
#
# if you're testing, these recipes *will* work on a t1.micro. just don't use it for anything.
#
Ironfan.cluster 'hadoop_demo' do
cloud(:ec2) do
defaults
availability_zones ['us-east-1d']
flavor 'm1.large'
backing 'ebs'
image_name 'ironfan-natty'
bootstrap_distro 'ubuntu10.04-ironfan'
mount_ephemerals(:tags => {
:hadoop_scratch => true,
:hadoop_data => true, # remove this if you use the volume at bottom
})
end
# # uncomment if you want to set your environment.
# environment :prod
role :systemwide
role :chef_client
role :ssh
role :nfs_client
role :volumes
role :package_set, :last
role :minidash, :last
role :org_base
role :org_final, :last
role :org_users
role :hadoop
role :hadoop_s3_keys
role :tuning
role :jruby
role :pig
recipe 'hadoop_cluster::config_files', :last
facet :master do
instances 1
role :hadoop_namenode
role :hadoop_jobtracker
role :hadoop_secondarynn
role :hadoop_tasktracker
role :hadoop_datanode
end
facet :worker do
instances 4
role :hadoop_datanode
role :hadoop_tasktracker
end
cluster_role.override_attributes({
:hadoop => {
# # adjust these
# :java_heap_size_max => 1400,
# :namenode => { :java_heap_size_max => 1000, },
# :secondarynn => { :java_heap_size_max => 1000, },
# :jobtracker => { :java_heap_size_max => 3072, },
# :datanode => { :java_heap_size_max => 1400, },
# :tasktracker => { :java_heap_size_max => 1400, },
# # if you decommission nodes for elasticity, crank this up
# :balancer => { :max_bandwidth => (50 * 1024 * 1024) },
# # make mid-flight data much smaller -- useful esp. with ec2 network constraints
:compress_mapout_codec => 'org.apache.hadoop.io.compress.SnappyCodec',
}
})
# Launch the cluster with all of the below set to 'stop'.
#
# After initial bootstrap,
# * set the run_state to :start in the lines below
# * run `knife cluster sync hadoop_demo-master` to push those values up to chef
# * run `knife cluster kick hadoop_demo-master` to re-converge
#
# Once you see 'nodes=1' on jobtracker (host:50030) & namenode (host:50070)
# control panels, you're good to launch the rest of the cluster.
#
facet(:master).facet_role.override_attributes({
:hadoop => {
:namenode => { :run_state => :stop, },
:secondarynn => { :run_state => :stop, },
:jobtracker => { :run_state => :stop, },
:datanode => { :run_state => :stop, },
:tasktracker => { :run_state => :stop, },
},
})
# #
# # Attach persistent storage to each node, and use it for all hadoop data_dirs.
# #
# # Modify the snapshot ID and attached volume size to suit
# #
# volume(:ebs1) do
# defaults
# size 200
# keep true
# device '/dev/sdj' # note: will appear as /dev/xvdj on natty
# mount_point '/data/ebs1'
# attachable :ebs
# snapshot_name :blank_xfs
# resizable true
# tags( :hadoop_data => true, :persistent => true, :local => false, :bulk => true, :fallback => false )
# create_at_launch true
# end
end