Part 1 Install Cassandra:
# Note that the Debian packages will automatically install OpenJDK. You'll want to avoid that because of Hadoop, so install from the tarball instead.
# Unpack the Cassandra tarball and start a node from the unpacked tree.
tar xvzf dsc.tar.gz
cd dsc-cassandra-1.1.2/lib
# NOTE(review): nothing is done inside lib/ in these notes -- a step may be
# missing here (confirm). bin/ sits next to lib/, so go back up to the
# install root before starting the server.
cd ..
sudo bin/cassandra
# `describe cluster;` is a cassandra-cli statement, not a shell command:
# open the CLI first, then run it at the CLI prompt to confirm the node responds.
bin/cassandra-cli
describe cluster;
Part 2 Setup Cassandra Schema:
# Open a CQL3 shell; the statements that follow are typed at the cqlsh prompt.
./bin/cqlsh --cql3
-- Create the workshop keyspace. SimpleStrategy with replication_factor 1
-- keeps a single replica of each row.
CREATE KEYSPACE big_data
    WITH strategy_class = 'org.apache.cassandra.locator.SimpleStrategy'
    AND strategy_options:replication_factor='1';
use big_data;
-- One counter per (user_id, tag) pair: user_id is the partition key and tag
-- is the clustering column, so all of a user's tags live in one partition.
CREATE TABLE user_tags (
    user_id varchar,
    tag varchar,
    value counter,
    PRIMARY KEY (user_id, tag)
);
-- Increment the 'cassandra' tag counter for user 'paul', then read the table back.
UPDATE user_tags
    SET value = value + 1
    WHERE user_id = 'paul' AND tag = 'cassandra';
SELECT * FROM user_tags;
-- NOTE(review): the CREATE TABLE header and the closing parenthesis were
-- missing from these notes. The table name "posts" is an assumption --
-- confirm it against reducer_write_posts.rb before running.
CREATE TABLE posts (
    post_id varchar PRIMARY KEY,
    title varchar,
    body varchar,
    post_type ascii,
    tags varchar,
    owner_id ascii,
    created_date timestamp
);
Part 3 Ruby to Cassandra:
# Install a compiler toolchain before the gem: thrift (presumably pulled in by
# the cassandra gem -- confirm) wraps native code that must be built on install.
sudo apt-get install build-essential # thrift is a native wrapper, so it needs a compiler
sudo gem install cassandra
Part 4 Write data into Cassandra from Hadoop:
# Create a column family to act as an index of which posts each user has
# interacted with.
# NOTE: this is cassandra-cli syntax (run it at the bin/cassandra-cli prompt),
# not CQL. comparator = 'LongType' makes column names sort as 64-bit integers.
create column family user_post_activity
with column_type = 'Standard'
and comparator = 'LongType';
# Hadoop Streaming job that runs mapper_user_posts.rb over input/posts_small.xml.
# mapreduce.job.reduces=0 makes the job map-only, so there is no reduce phase
# (presumably the mapper itself writes to Cassandra -- confirm in the script).
# -file ships the mapper script to the task nodes along with the job.
hadoop jar /usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.0.1.jar \
-D mapreduce.job.reduces=0 \
-input input/posts_small.xml \
-output insert_user_post_activity_job \
-mapper mapper_user_posts.rb \
-file mapper_user_posts.rb
# Presumably reads back what the job wrote -- confirm in get_user_posts.rb.
ruby get_user_posts.rb
-- Presumably the CQL3 counterpart of the user_post_activity column family
-- (confirm). Partitioned by user_id; rows within a partition are clustered
-- by (time, post_id).
CREATE TABLE user_post_activity2 (
    user_id ascii,
    time timestamp,
    post_id ascii,
    activity_type int,
    PRIMARY KEY (user_id, time, post_id)
);
# cassandra-cql is a Ruby client gem for CQL (presumably what the Ruby scripts
# use to write to user_post_activity2 -- confirm).
sudo gem install cassandra-cql
Part 5 Use Hadoop reducers to parallelize writes:
look at mapper_parse_posts.rb
look at reducer_write_posts.rb
# Hadoop Streaming job over input/posts_small.xml with mapper_parse_posts.rb as
# the mapper and reducer_write_posts.rb as the reducer, run as 2 reduce tasks.
# NOTE(review): the reducer count is set twice (-D mapreduce.job.reduces=2 and
# -numReduceTasks 2). The two are redundant; keep them in agreement if you
# change the number of reducers.
hadoop jar /usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.0.1.jar \
-D mapreduce.job.reduces=2 \
-input input/posts_small.xml \
-output insert_posts_job3 \
-mapper mapper_parse_posts.rb \
-reducer reducer_write_posts.rb \
-file mapper_parse_posts.rb \
-file reducer_write_posts.rb \
-numReduceTasks 2