Updated example code
petewarden committed Mar 25, 2012
1 parent 1677f27 commit e8ec5eb
Showing 3 changed files with 47 additions and 50 deletions.
1 change: 1 addition & 0 deletions example_input.txt
@@ -0,0 +1 @@
s3://aws-publicdatasets/common-crawl/crawl-002/2009/11/21/0/1258794327404_0.arc.gz
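
For reference, this is the kind of line that extension_map.rb (below) reads from its input; a minimal sketch of how the script's own split/join step would decompose this example path:

line = "s3://aws-publicdatasets/common-crawl/crawl-002/2009/11/21/0/1258794327404_0.arc.gz"
proto, unused, bucket_name, *rest = line.chomp.split File::SEPARATOR
# proto       => "s3:"  (the empty field from "//" lands in `unused`)
# bucket_name => "aws-publicdatasets"
object_name = File.join rest
# object_name => "common-crawl/crawl-002/2009/11/21/0/1258794327404_0.arc.gz"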
90 changes: 44 additions & 46 deletions extension_map.rb
@@ -57,56 +57,54 @@ def each

# All these warnings will end up in the EMR stderr logs.
warn "Starting up, using #{CHUNKSIZE/1024}KB chunks for download."

# CHANGEME! - You'll need to put your own Amazon keys in here
s3=AWS::S3.new(
  :access_key_id=>'',
  :secret_access_key=>''
)
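
# (Illustrative alternative, not part of the original script: the keys could be
# pulled from the environment rather than edited into the source, e.g.
#   s3=AWS::S3.new( :access_key_id=>ENV['AWS_ACCESS_KEY_ID'],
#                   :secret_access_key=>ENV['AWS_SECRET_ACCESS_KEY'] )
# so the file never needs to contain credentials.)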

ARGF.each_line {|line|
  begin
warn "Starting work on #{line.chomp}"
# expect a line like this:
# s3://commoncrawl-crawl-002/2010/09/24/9/1285380159663_9.arc.gz
proto,unused,bucket_name,*rest=line.chomp.split File::SEPARATOR
raise ArgumentError, "#{__FILE__}: Unknown S3 Protocol #{proto}" unless proto=~/^s3/
object_name=File.join rest

size=Integer( s3.buckets[bucket_name].objects[object_name].content_length )
warn "Reading from #{bucket_name.inspect}, #{object_name.inspect}, size #{size}"
ranges=(0..size).each_slice( CHUNKSIZE ).map {|ary| (ary.first..ary.last)}

# Ruby GzipReader is unable to unzip these files, but unix gunzip can
# Also means we don't need to eat much RAM, because everything is streaming.
Open3.popen3( 'gunzip -c' ) {|sin,sout,serr,thr|

# Create an ArcFile instance which will receive gunzip's stdout
arcfile=ArcFile.new sout

Thread.new do
# Download chunks in the background and pipe them into gunzip
# as we receive them
ranges.each {|target_range|
retry_count=5
begin
chunk=s3.buckets[bucket_name].objects[object_name].read( :range => target_range )
rescue
raise $! if (retry_count-=1)<0
warn "Error (#{$!}) downloading #{target_range}, retrying."
sleep 1 and retry
end
sin.write chunk
Thread.pass
}
sin.close # which will send an EOF to the ArcFile
end

# Now we have a lazy ArcFile that we can treat as an Enumerable.
arcfile.each {|header, body|
# mimetype and URL extension (but don't keep ? params to php urls etc)
puts( "#{header.split[3]}".ljust(25) << "#{File.extname( header.split.first ).split('?').first}".ljust(15) )
warn "Starting work on #{line.chomp}"
# expect a line like this:
# s3://commoncrawl-crawl-002/2010/09/24/9/1285380159663_9.arc.gz
proto,unused,bucket_name,*rest=line.chomp.split File::SEPARATOR
raise ArgumentError, "#{__FILE__}: Unknown S3 Protocol #{proto}" unless proto=~/^s3/
object_name=File.join rest

size=Integer( s3.buckets[bucket_name].objects[object_name].content_length )
warn "Reading from #{bucket_name.inspect}, #{object_name.inspect}, size #{size}"
ranges=(0..size).each_slice( CHUNKSIZE ).map {|ary| (ary.first..ary.last)}
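    # For example, if CHUNKSIZE were 1048576 (1MB) and size were 2621440 (2.5MB),
    # this would yield three inclusive byte ranges:
    #   0..1048575, 1048576..2097151, 2097152..2621440
    # (the last one reaches one byte past the end of the object, which S3 tolerates
    # on a ranged read).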

    # Ruby GzipReader is unable to unzip these files, but unix gunzip can.
    # Also means we don't need to eat much RAM, because everything is streaming.
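    # (Likely reason: each record in these .arc.gz files is its own gzip member,
    # concatenated back to back; Zlib::GzipReader stops after the first member,
    # while `gunzip -c` decompresses all of them in sequence.)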
    Open3.popen3( 'gunzip -c' ) {|sin,sout,serr,thr|

      # Create an ArcFile instance which will receive gunzip's stdout
      arcfile=ArcFile.new sout

      Thread.new do
        # Download chunks in the background and pipe them into gunzip
        # as we receive them
        ranges.each {|target_range|
          retry_count=5
          begin
            chunk=s3.buckets[bucket_name].objects[object_name].read( :range => target_range )
          rescue
            raise $! if (retry_count-=1)<0
            warn "Error (#{$!}) downloading #{target_range}, retrying."
            sleep 1 and retry
          end
          sin.write chunk
          Thread.pass
        }
        sin.close # which will send an EOF to the ArcFile
      end

      # Now we have a lazy ArcFile that we can treat as an Enumerable.
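      # Each yielded header is a single space-separated ARC header line, roughly
      # "<url> <ip-address> <archive-date> <content-type> <length>", so
      # header.split.first is the record's URL and header.split[3] its mimetype.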
      arcfile.each {|header, body|
        # mimetype and URL extension (but don't keep ? params to php urls etc)
        puts( "#{header.split[3]}".ljust(25) << "#{File.extname( header.split.first ).split('?').first}".ljust(15) )
      }
    }
  rescue
    warn "Failed to process #{line}: #{$!}"
  end
}
6 changes: 2 additions & 4 deletions setup.sh
@@ -3,15 +3,13 @@
echo 'moving forward in time to a year starting with 2'
sudo apt-get update
sudo apt-get -y upgrade
echo 'unsuckifying system by installing basic libs that should have been there in the first place'
sudo apt-get install -y build-essential openssl libreadline6 libreadline6-dev curl git-core zlib1g zlib1g-dev libssl-dev libyaml-dev libxml2-dev libxslt-dev
echo 'done'
echo 'installing ruby and rubygems'
sudo apt-get -y -t universe install ruby rubygems
echo 'done'
echo 'installing AWS Ruby SDK which should also have been there in the first place'
sudo gem install aws-sdk --source http://rubygems.org
echo 'done'
echo 'OK, good to go'
