added another scraper example

commit 8d19ae8a8712a8e00d6ff9039d57ffc7817d3ac9 (1 parent: 00c93f0)
Authored by Matt Aimonetti
processors/episode_summary.rb
@@ -12,9 +12,14 @@ module EpisodeSummary
   # @return [String]
   def process(items, format=:html)
     if format == :html
-      html_header + "\n" + \
-        items.sort_by(&:show_name).map{|i| html_episode_summary(i)}.join("\n") + \
-        html_footer
+      output = html_header + "\n"
+      if items[0] && items[0].show_name
+        output += items.sort_by(&:show_name).map{|i| html_episode_summary(i)}.join("\n")
+      else
+        output += items.sort_by(&:title).map{|i| html_episode_summary(i)}.join("\n")
+      end
+      output += html_footer
+      output
     elsif format == :json
       items.map(&:to_json)
     else
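
A note on the new branch in EpisodeSummary.process: when the first item has no show_name (the eztv.it episodes added below only expose url and title), the summary falls back to sorting by title. A minimal sketch of that decision, using hypothetical OpenStruct stand-ins rather than the real scraper classes:

    require 'ostruct'

    # Stand-ins for scraped episodes; the real ones come from the scrapers.
    francetv = [OpenStruct.new(show_name: "B Show", title: "Ep 1"),
                OpenStruct.new(show_name: "A Show", title: "Ep 2")]
    eztv     = [OpenStruct.new(title: "Zeta S01E01"),
                OpenStruct.new(title: "Alpha S01E01")]

    # Same decision the processor now makes: pick the sort key from the first item.
    [francetv, eztv].each do |items|
      key = (items[0] && items[0].show_name) ? :show_name : :title
      puts items.sort_by(&key).map(&key).inspect
    end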
runner.rb
@@ -10,12 +10,21 @@
 
 FileUtils.mkdir_p('output')
 
+### FranceTV ###
 # Scrap
 episodes = FranceTVJeunesse.run
 # 1st processor
-filtered_episodes = EpisodeFilter.process(episodes, "config/episode_filter.yml")
+filtered_episodes = episodes #EpisodeFilter.process(episodes, "config/episode_filter.yml")
 # 2nd processor
 summary = EpisodeSummary.process(filtered_episodes)
 # 3rd processor
-destination = File.join(File.expand_path(File.dirname(__FILE__)), "output", "summary_#{Time.now.strftime("%Y-%m-%d")}.html")
-`open #{ToFile.process(summary, destination)}`
+destination = File.join(File.expand_path(File.dirname(__FILE__)), "output", "francetv_summary_#{Time.now.strftime("%Y-%m-%d")}.html")
+puts ToFile.process(summary, destination)
+
+### eztv.it ###
+# Scrap
+episodes = EzTV.run
+summary = EpisodeSummary.process(episodes)
+# 3rd processor
+destination = File.join(File.expand_path(File.dirname(__FILE__)), "output", "eztv_summary_#{Time.now.strftime("%Y-%m-%d")}.html")
+puts ToFile.process(summary, destination)
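
The runner now writes one dated HTML summary per source and prints the result of ToFile.process rather than shelling out to open. Both blocks repeat the same output-path pattern; a purely illustrative helper (output_path is not part of the repo) that captures it:

    # Hypothetical helper: builds output/<prefix>_summary_<YYYY-MM-DD>.html
    def output_path(prefix)
      File.join(File.expand_path(File.dirname(__FILE__)), "output",
                "#{prefix}_summary_#{Time.now.strftime('%Y-%m-%d')}.html")
    end

    puts output_path("eztv")  # => .../output/eztv_summary_<today's date>.html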
scrapers/eztv_it.rb
@@ -0,0 +1,45 @@
+require_relative 'utils'
+require_relative 'interfaces'
+require 'json'
+
+module EzTV
+
+  def self.run
+    agent = Mechanize.new
+    url = "http://eztv.it/sort/100/"
+    page = agent.get(url)
+    episodes = fetch_episodes(page)
+    STDERR << "Error scraping #{url} - #{episodes.inspect}\n" if episodes.find{|e| e.failed?}
+    episodes
+  end
+
+  def self.fetch_episodes(page)
+    elements = page.search("table.forum_header_border tr.forum_header_border")
+    episodes = elements.map do |e|
+      episode = Episode.new
+      episode.url = episode.fetch(e, "a.magnet", ->(el){ el.first.attributes['href'].value})
+      episode.title = episode.fetch(e, "a.epinfo", ->(el){ el.first.attributes["title"].value.strip})
+      episode
+    end
+    episodes
+  end
+
+  class Episode
+    include Scrapbook::Utils::Fetcher
+    include EpisodeInterface
+
+    def initialize
+      @failures = []
+    end
+
+    def to_json
+      hash = {}
+      EpisodeInterface::ATTRIBUTES.each do |att|
+        hash[att] = self.send(att)
+      end
+      hash.to_json
+    end
+
+  end
+end
+
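
EzTV.run returns an array of Episode objects implementing EpisodeInterface, so runner.rb can feed it straight into EpisodeSummary.process just like the FranceTV scraper. A hedged usage sketch, assuming Mechanize is installed, eztv.it is reachable, and the repo root is the working directory:

    require 'mechanize'
    require_relative 'scrapers/eztv_it'

    episodes = EzTV.run
    episodes.reject(&:failed?).each do |e|
      puts "#{e.title} -> #{e.url}"  # title and magnet link scraped from each table row
    end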
scrapers/interfaces.rb
@@ -13,7 +13,7 @@ def to_s
   end
 
   def failed?
-    if self.url.nil? || !self.failures.empty?
+    if self.url.nil? || (self.failures && !self.failures.empty?)
       true
     else
       false
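
The extra nil guard in failed? only matters for an episode that has a url but whose @failures was never initialized (with a nil url the first clause already returns true, and EzTV::Episode sets @failures in initialize). A purely illustrative sketch; BareEpisode is hypothetical and the require path assumes the repo root as working directory:

    require_relative 'scrapers/interfaces'

    # Hypothetical episode class that forgets to initialize @failures
    class BareEpisode
      include EpisodeInterface
      attr_accessor :url, :failures
    end

    e = BareEpisode.new
    e.url = "magnet:?xt=urn:btih:abc123"
    e.failed?  # => false; before this change, a nil failures raised NoMethodError on .empty?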
