Browse files

add delicious.com, Twitter JSON sample scripts

  • Loading branch information...
1 parent 03bdfe8 commit de45d03fdbce6198ceba1af6a5e9885c99018e13 @mislav committed Oct 24, 2009
Showing with 166 additions and 0 deletions.
  1. +4 −0 README.md
  2. +24 −0 Rakefile
  3. +37 −0 examples/delicious.rb
  4. +101 −0 examples/twitter.rb
View
4 README.md
@@ -26,6 +26,10 @@ Scraper
blog.articles.first.url
#=> "http://example.com/article"
+There are sample scripts in the "examples/" directory; run them with:
+
+ ruby -rubygems examples/<script>.rb
+
[See the wiki][wiki] for more on how to use *Scraper*.
Requirements
View
24 Rakefile
@@ -0,0 +1,24 @@
+task :default => :spec
+
+desc %(Run specs)
+task :spec do
+ exec %(ruby -rubygems scraper.rb --color)
+end
+
+desc %(Count lines of code in implementation)
+task :loc do
+ File.open('scraper.rb') do |file|
+ loc, counting = 1, false
+
+ file.each_line do |line|
+ case line
+ when /^class\b/ then counting = true
+ when /^\s*(#|\Z)/ then next
+ when /^end\b/ then break
+ end
+ loc += 1 if counting
+ end
+
+ puts loc
+ end
+end
View
37 examples/delicious.rb
@@ -0,0 +1,37 @@
+## Delicious bookmarks fetching
+#
+# Let's pretend that delicious.com doesn't have an API.
+# This is a demonstration of the most common use-case.
+
+require 'scraper'
+require 'open-uri'
+require 'date'
+
+# extracts data from a single bookmark
+class Bookmark < Scraper
+ # each rule maps a selector to an accessor name (e.g. `bookmark.title` in the usage below)
+ element 'h4 a' => :title
+ element '.description' => :description
+
+ # extract attribute with xpath
+ element './/h4/a/@href' => :url
+
+ # tags are plural
+ elements 'ul.tag-chain li span' => :tags
+
+ # dates are in form "22 OCT 09"
+ # '%d %b %y' = day of month, abbreviated month name, two-digit year;
+ # the lambda is given an object responding to `inner_text` (presumably the matched node)
+ element '.dateGroup span' => :date, :with => lambda { |span|
+ Date.strptime(span.inner_text.strip, '%d %b %y')
+ }
+end
+
+# finds all bookmarks on the page
+class Delicious < Scraper
+ # :with => Bookmark -- items of `bookmarks` expose Bookmark's accessors (see usage below);
+ # presumably each matched element is parsed by the Bookmark scraper -- confirm against Scraper
+ elements '#bookmarklist div.bookmark' => :bookmarks, :with => Bookmark
+end
+
+# fetch the page over HTTP and print a few fields of the first bookmark
+# NOTE(review): passing a URL to `open` relies on open-uri's Kernel#open hook;
+# Ruby 3.0+ requires URI.open instead -- confirm the target Ruby version
+mislav = Delicious.parse open('http://delicious.com/mislav/ruby')
+bookmark = mislav.bookmarks.first
+
+puts bookmark.title #=> "Some title"
+p bookmark.tags #=> ['foo', 'bar', ...]
+puts bookmark.date #=> <Date>
View
101 examples/twitter.rb
@@ -0,0 +1,101 @@
+## JSON data extraction example
+#
+# This is an example of how we're not limited to Nokogiri and HTML screen-scraping.
+# Here we use Scraper to extract tweets from a Twitter API JSON response.
+#
+# Requirements: a JSON library (tested with "json" gem)
+
+require 'scraper'
+require 'json'
+require 'time'
+
+# a wrapper for JSON data that provides `at` and `search`
+class JsonDocument
+ def initialize(obj)
+ @data = String === obj ? JSON.parse(obj) : obj
+ end
+
+ def self.[](obj)
+ self.class === obj ? obj : new(obj)
+ end
+
+ def search(selector)
+ @data.to_a
+ end
+
+ def at(selector)
+ @data[selector]
+ end
+end
+
+# a scraper that works with JsonDocument
+class JsonScraper < Scraper
+ def self.convert_document(doc)
+ JsonDocument[doc]
+ end
+end
+
+# now here's the real deal
+class Twitter < JsonScraper
+ # strings are parsed into a JsonDocument; any other object is returned untouched
+ def self.convert_document(doc)
+ String === doc ? JsonDocument.new(doc) : doc
+ end
+
+ # rules inside the block describe a single tweet
+ # (presumably evaluated on a JsonScraper subclass -- confirm against Scraper)
+ elements :tweets, :with => JsonScraper do
+ element :created_at
+ element :text
+ element :id
+ # the "user" key is exposed as `author` (used as `tweet.author.username` below)
+ element 'user' => :author, :with => JsonScraper do
+ element 'name' => :full_name
+ element 'screen_name' => :username
+ end
+ end
+end
+
+twitter = Twitter.parse(DATA.read)
+
+twitter.tweets.each do |tweet|
+ puts "@%s: %s" % [tweet.author.username, tweet.text]
+ puts
+end
+
+
+__END__
+[{"created_at": "Thu Oct 22 23:50:02 +0000 2009",
+ "text":
+ "\"It is OK being wrong.\" \"I don't have any experience in that field.\"",
+ "id": 5083117521,
+ "user":
+ {"name": "Ryan Bigg",
+ "created_at": "Thu Apr 24 03:23:53 +0000 2008",
+ "location": "iPhone: -27.471957,152.999225",
+ "profile_image_url":
+ "http://a1.twimg.com/profile_images/287965508/Photo_47_normal.jpg",
+ "url": "http://www.frozenplague.net",
+ "id": 14506011,
+ "followers_count": 432,
+ "description": "I work at Mocra and code Ruby on Rails",
+ "statuses_count": 7659,
+ "friends_count": 211,
+ "screen_name": "ryanbigg"},
+ "source": "<a href=\"http://www.atebits.com/\" rel=\"nofollow\">Tweetie</a>"},
+ {"created_at": "Mon Oct 19 23:43:50 +0000 2009",
+ "text":
+ "Programming is the art of forcing the exceptions of the real world into the absolutes of a computer.",
+ "id": 5004137490,
+ "user":
+ {"name": "Ryan Bates",
+ "created_at": "Fri Mar 28 19:10:25 +0000 2008",
+ "location": "Southern Oregon",
+ "profile_image_url":
+ "http://a1.twimg.com/profile_images/52189024/ryan_bates_cropped_normal.jpg",
+ "url": "http://railscasts.com",
+ "id": 14246143,
+ "followers_count": 3225,
+ "description": "Producer of Railscasts - Free Ruby on Rails Screencasts",
+ "profile_background_image_url":
+ "http://s.twimg.com/a/1255724203/images/themes/theme2/bg.gif",
+ "statuses_count": 2066,
+ "friends_count": 225,
+ "screen_name": "rbates"}
+ }]

0 comments on commit de45d03

Please sign in to comment.