Permalink
Browse files

added a bunch of stuff for the lessons

  • Loading branch information...
1 parent 494a901 commit 0f0dc6372d7416481ff383af97fa3d167c9a0690 @pauldix committed Aug 27, 2012
No changes.
View
@@ -0,0 +1,4 @@
+source :rubygems
+gem "ruby-readability", :require => 'readability'
+gem "json"
+gem "awesome_print"
View
@@ -0,0 +1,18 @@
+GEM
+ remote: http://rubygems.org/
+ specs:
+ awesome_print (1.0.2)
+ guess_html_encoding (0.0.4)
+ json (1.7.5)
+ nokogiri (1.5.5)
+ ruby-readability (0.5.4)
+ guess_html_encoding (>= 0.0.4)
+ nokogiri (>= 1.4.2)
+
+PLATFORMS
+ ruby
+
+DEPENDENCIES
+ awesome_print
+ json
+ ruby-readability
View
@@ -0,0 +1,40 @@
+require 'rubygems'
+require 'cassandra'
+
# Open a client for the 'big_data' keyspace on the local Cassandra node.
keyspace = 'big_data'
server = '127.0.0.1:9160'
db = Cassandra.new(keyspace, server)

# Fetch the :user_tags row stored under the key "paul".
row = db.get(:user_tags, "paul")
+
# Flattens one :user_tags row into a plain Hash of tag name => count.
#
# row - any object whose #each_pair yields (column, count) pairs. Each column
#       must respond to #parts; the first part is used as the tag name.
#       (Presumably a composite column from the cassandra gem - confirm
#       against the column family definition.)
#
# Returns a Hash mapping each tag name to the value stored for that column.
# When two columns share the same first part, the later one wins.
def tag_counts_from_row(row)
  tags = {}
  row.each_pair { |column, tag_count| tags[column.parts.first] = tag_count }
  tags
end
+
# insert a new user: adds 3 to the "postgres" column of row "todd"
# (presumably a counter column - confirm against the cassandra gem's #add)
db.add(:user_tags, "todd", 3, "postgres")

# report the row that was fetched for paul
puts "paul - #{tag_counts_from_row(row).inspect}"

# output everyone's tags: collect every row key first, then fetch all of
# the rows with a single multi_get
user_ids = []
db.get_range(:user_tags, :batch_size => 10000) { |id| user_ids << id }

db.multi_get(:user_tags, user_ids).each do |name, user_row|
  puts "#{name} - #{tag_counts_from_row(user_row).inspect}"
end
View
@@ -0,0 +1,23 @@
+require 'rubygems'
+require 'cassandra'
+
db = Cassandra.new('big_data', '127.0.0.1:9160')

# Collect every row key in :user_post_activity, then load all rows at once.
user_ids = []
db.get_range(:user_post_activity, :batch_size => 10000) { |id| user_ids << id }

# Print each user's activity: one "<post_id> at <time>" line per column,
# followed by a separator line.
db.multi_get(:user_post_activity, user_ids).each do |user_id, columns|
  puts "user_id: #{user_id}"

  columns.each do |cassandra_column, post_id|
    # NOTE(review): the column name is shifted right by 12 bits and divided by
    # one million before being treated as epoch seconds - presumably it packs a
    # microsecond timestamp; confirm against the column comparator.
    epoch_seconds = (cassandra_column.to_i >> 12) / 1_000_000
    puts "#{post_id} at #{Time.at(epoch_seconds)}"
  end

  puts "***************"
end
View
@@ -0,0 +1,34 @@
+require "rubygems"
+require "bundler/setup"
+require 'open-uri'
+require 'awesome_print'
+require 'json'
+require 'readability'
+
+# source = open('http://www.nytimes.com/2012/08/19/business/lawyers-of-big-tobacco-lawsuits-take-aim-at-food-industry.html').read
+# puts Readability::Document.new(source).content
+
# Small client for the HNSearch (ThriftDB) item search endpoint. It searches
# submissions, then downloads every linked page and prints its readable text.
class HackerNewsApi
  # Search endpoint with the fixed parameters already applied: results are
  # sorted by create_ts descending and filtered to type "submission".
  BASE_ITEMS_URL = "http://api.thriftdb.com/api.hnsearch.com/items/_search?sortby=create_ts%20desc&filter[fields][type]=submission&".freeze

  # Builds the request URL for a search.
  #
  # options - Hash with :query (search terms) and optional :domain, :limit
  #           and :start. Every value is form-encoded so spaces, "&", "=" and
  #           similar characters cannot corrupt the query string.
  #
  # Returns the URL as a String.
  def self.search_url(options)
    url = BASE_ITEMS_URL.dup
    url << "q=#{escape(options[:query])}"
    url << "&filter[fields][domain]=#{escape(options[:domain])}" if options.has_key?(:domain)
    url << "&limit=#{escape(options[:limit])}" if options.has_key?(:limit)
    url << "&start=#{escape(options[:start])}" if options.has_key?(:start)
    url
  end

  # Runs a search, pretty-prints the parsed JSON response, then fetches each
  # result's URL and prints the content Readability extracts from it.
  #
  # options - see .search_url.
  #
  # Returns the "results" array from the response.
  # Raises whatever the HTTP fetch or JSON parsing raises; nothing is rescued.
  def self.search(options)
    json = JSON.parse(URI.parse(search_url(options)).open.read)
    ap json

    json["results"].each do |result|
      item_url = result["item"]["url"]
      next unless item_url

      # The URL comes from remote data, so it is opened through URI rather
      # than Kernel#open, which would run a command for a string starting
      # with "|" and no longer fetches URLs at all on Ruby 3+.
      raw_content = URI.parse(item_url).open.read
      puts Readability::Document.new(raw_content).content
    end
  end

  # Form-encodes a single query-string value (nil becomes "").
  def self.escape(value)
    URI.encode_www_form_component(value.to_s)
  end
  private_class_method :escape
end
+
# Run a sample search for "facebook" submissions.
HackerNewsApi.search(:query => "facebook")
View
@@ -0,0 +1,20 @@
+#!/usr/bin/env ruby
+
# Pulls the tag names out of one post row.
#
# line - a String holding one row; tags are expected in an attribute of the
#        form Tags="&lt;tag-a&gt;&lt;tag-b&gt;" (XML-escaped angle brackets).
#
# Returns an Array of tag name Strings in the order they appear, or an empty
# Array when the line has no Tags attribute.
def extract_tags(line)
  tags_match = line.match(/Tags="(.*?)"/)
  return [] unless tags_match

  # Each tag sits between an escaped "<" and an escaped ">".
  tags_match[1].scan(/&lt;(.*?)&gt;/).flatten
end

STDIN.each do |line|
  # only look at questions (PostTypeId="1"); other rows are skipped
  next unless line.index('PostTypeId="1"')

  # put to standard out in tab delimited form for the reducer to pick up
  extract_tags(line).each do |tag|
    puts "#{tag}\t1"
  end
end
View
@@ -0,0 +1,6 @@
+#!/usr/bin/env ruby
+
# Re-emit each "tag<TAB>count" record as "count<TAB>tag".
STDIN.each_line do |record|
  fields = record.strip.split("\t")
  puts "#{fields[1]}\t#{fields[0]}"
end
@@ -0,0 +1,48 @@
+#!/usr/bin/env ruby
+
+require 'date'
+require 'rubygems'
+require 'cassandra-cql'
+require 'cassandra'
+
db = Cassandra.new('big_data', '127.0.0.1:9160')

# db = CassandraCQL::Database.new('127.0.0.1:9160', {:keyspace => 'big_data'})

# Reads one post row per line from standard input and records, per user, which
# post was touched at which creation time in :user_post_activity.
STDIN.each do |line|
  # Rows that carry a ParentId are filed under that id; all others use their
  # own Id. Lines with neither are skipped silently.
  id_match = line.match(/\sParentId="(.*?")/) || line.match(/\sId="(.*?")/)
  next unless id_match

  # The capture includes the closing quote, so it is trimmed off here.
  post_id = id_match[1].chomp('"')

  # TODO: add this in or take it out
  activity_type = line.index('PostTypeId="1"') ? 1 : 2
  # end

  owner_match = line.match(/\sOwnerUserId="(.*?")/)
  created_match = line.match(/\sCreationDate="(.*?")/)

  # Without both an owner and a creation date there is nothing to store.
  unless created_match && owner_match
    puts "****************************"
    next
  end

  user_id = owner_match[1].chomp('"')
  creation_time_string = created_match[1].chomp('"')
  creation_time = DateTime.parse(creation_time_string).to_time

  # One column per activity: the creation time is the column name and the
  # post id is its value.
  db.insert(:user_post_activity, user_id, {creation_time => post_id})
  # db.execute("UPDATE user_post_activity2 SET 'activity_type'=? WHERE 'user_id'=? AND 'time'=? AND 'post_id'=?",
  # activity_type, user_id, creation_time_string, post_id)
  # db.execute("UPDATE user_post_activity2 SET 'time'=?, 'post_id'=?, 'activity_type'=? WHERE 'user_id'=?",
  # creation_time_string, post_id, activity_type, user_id)
  # db.execute("INSERT INTO user_post_activity2 ('user_id', 'time', 'post_id', 'activity_type') VALUES (?, ?, ?, ?)",
  # user_id, creation_time_string, post_id, activity_type)
  puts "User #{user_id} on post #{post_id} at #{creation_time}"
end
View
@@ -0,0 +1,22 @@
+#!/usr/bin/env ruby
+
# Sums the counts of consecutive records that share a tag.
#
# The shuffle between the map and reduce phase sorts the input to the
# reducer, so all records for one tag arrive together. A total is therefore
# emitted whenever the tag changes, and once more after the last record.
#
# lines - any object whose #each yields "tag<TAB>count" Strings. A trailing
#         newline is tolerated, blank lines are skipped, and a missing or
#         non-numeric count is treated as 0.
#
# Yields each tag and its Integer total, in input order.
# Returns nothing meaningful.
def each_tag_total(lines)
  current_tag = nil
  current_count = 0

  lines.each do |line|
    tag, count = line.chomp.split("\t")
    next if tag.nil? # blank line

    if tag != current_tag
      # A new group starts: flush the previous one, if there was one.
      yield current_tag, current_count if current_tag
      current_tag = tag
      current_count = 0
    end

    current_count += count.to_i
  end

  # The last group has no following tag change to trigger its output.
  yield current_tag, current_count if current_tag
end

each_tag_total(STDIN) { |tag, total| puts "#{tag}\t#{total}" }
View
@@ -0,0 +1,6 @@
+#!/usr/bin/env ruby
+
# Re-emit each "count<TAB>tag" record as "tag<TAB>count".
STDIN.each_line do |record|
  fields = record.strip.split("\t")
  puts "#{fields[1]}\t#{fields[0]}"
end
Oops, something went wrong.

0 comments on commit 0f0dc63

Please sign in to comment.