Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

cleaned up and added a filter

  • Loading branch information...
commit 00c93f05c8e135b2bb1e1d11fa22107724e216a5 1 parent d90aecf
@mattetti authored
View
18 README.md
@@ -32,6 +32,8 @@ many processors until it reaches its final form. If the data is meant to
be persisted, the persistence layer should be implemented as a processor.
Examples of processors are data extractors, event triggers, persistence
layers etc...
+Processors are called via `#process` and are usually implemented as
+modules since they don't need to create instances or keep state.
### Design
@@ -39,3 +41,19 @@ Each unit should be autonomous, easy to test and chainable. Raised
exceptions should really be exceptional and always mean that human
intervention is needed. Ideally, each unit should also be designed to
run concurrently.
+
+
+## Development
+
+To run the scenario:
+
+```
+$ ruby runner.rb
+```
+
+To test/play with the different parts, use the console:
+
+```
+bin/console
+```
+
View
7 config/episode_filter.yml
@@ -0,0 +1,7 @@
+---
+keep:
+- show_name: 64 rue du zoo
+- show_ref: mini_loup
+- show_name: Zou
+- show_ref: lulu_vroumette
+- show_ref: flapacha
View
26 processors/episode_filter.rb
@@ -0,0 +1,26 @@
require 'yaml'

# Filters a collection of episodes against a YAML config file.
#
# The config file is expected to contain a top-level `keep` list whose
# entries are single-pair mappings of an episode attribute name to a
# pattern, for example:
#
#   keep:
#   - show_name: 64 rue du zoo
#   - show_ref: mini_loup
#
# An episode is kept as soon as one entry matches it.
module EpisodeFilter
  module_function

  # Selects the episodes matching at least one `keep` entry of the config.
  #
  # @param [Enumerable] episodes The episodes to filter.
  # @param [String, nil] config_file Path to the YAML config file. When nil,
  #   no filtering happens and the episodes are returned untouched.
  # @return [Enumerable, Array] the passed episodes when no config file is
  #   given, otherwise an array of the episodes to keep.
  # @raise [ArgumentError] if the config file does not define a `keep` list.
  def process(episodes, config_file = nil)
    return episodes if config_file.nil?

    config = YAML.load_file(config_file)
    # An empty YAML document does not load as a Hash, so guard before indexing.
    keep_filter = config.is_a?(Hash) ? config['keep'] : nil
    unless keep_filter.is_a?(Array)
      raise ArgumentError, "#{config_file}: expected a `keep` list of attribute/pattern entries"
    end

    episodes.select { |episode| match_filter?(keep_filter, episode) }
  end

  # Tells whether the episode matches at least one entry of the filter.
  #
  # Only the first attribute/pattern pair of each entry is considered. The
  # pattern is compiled as a case-insensitive regular expression.
  #
  # @param [Array<Hash>] filter The entries, each mapping an attribute name
  #   to a pattern.
  # @param [Object] episode Any object publicly responding to the attribute
  #   names used by the filter.
  # @return [Boolean]
  def match_filter?(filter, episode)
    filter.any? do |criterion|
      getter, pattern = criterion.first
      # public_send: the attribute name comes from a config file and must
      # never be able to reach private methods such as `exit` or `system`.
      attribute = episode.public_send(getter)
      # A missing attribute never matches, not even an empty pattern.
      next false if attribute.nil?

      # to_s on both sides: YAML may load a pattern such as `64` as an
      # Integer, and attributes are not guaranteed to be strings.
      attribute.to_s =~ Regexp.new(pattern.to_s, Regexp::IGNORECASE)
    end
  end
end
View
8 processors/episode_summary.rb
@@ -1,8 +1,8 @@
# Creates a formatted summary of a collection of episodes.
#
-class EpisodeSummary
+module EpisodeSummary
- attr_accessor :items
+ module_function
# Converts the passed episode items in a summary
# that is formatted based on the passed format.
@@ -11,7 +11,6 @@ class EpisodeSummary
# supported)
# @return [String]
def process(items, format=:html)
- self.items = items
if format == :html
html_header + "\n" + \
items.sort_by(&:show_name).map{|i| html_episode_summary(i)}.join("\n") + \
@@ -48,7 +47,8 @@ def html_episode_summary(item)
"' /img></div>") if item.image_url}
<a href="#{item.url}">
#{(item.notes.nil? || item.notes == "") ? 'link' : item.notes }
- </a>
+ </a>
+ #{("<span> Show ref: " + item.show_ref + "</span>") if item.show_ref}
</li>
EOS
end
View
4 processors/to_file.rb
@@ -1,6 +1,8 @@
require 'tempfile'
-class ToFile
+module ToFile
+
+ module_function
# Saves the passed content to a file.
# @param [#to_s] content The content to save to file.
View
11 runner.rb
@@ -9,8 +9,13 @@
Dir.glob("./processors/*.rb"){|file| require file }
FileUtils.mkdir_p('output')
-# TODO: use a scheduler and send to processors
+
+# Scrape
episodes = FranceTVJeunesse.run
-summary = EpisodeSummary.new.process(episodes)
+# 1st processor
+filtered_episodes = EpisodeFilter.process(episodes, "config/episode_filter.yml")
+# 2nd processor
+summary = EpisodeSummary.process(filtered_episodes)
+# 3rd processor
destination = File.join(File.expand_path(File.dirname(__FILE__)), "output", "summary_#{Time.now.strftime("%Y-%m-%d")}.html")
-puts ToFile.new.process(summary, destination)
+`open #{ToFile.process(summary, destination)}`
View
23 scrapers/interfaces.rb
@@ -0,0 +1,23 @@
# Shared behavior for episode objects.
#
# Including this module gives the host class a read/write accessor for
# every name in ATTRIBUTES and a read-only `failures` accessor. The host
# class is responsible for assigning `@failures`; when it has not done so,
# the episode is treated as having no failures.
module EpisodeInterface
  # Names of the accessors injected into the including class.
  ATTRIBUTES = [:show_name, :show_ref,
                :title, :url, :image_url, :broadcast_date, :notes].freeze

  # Injects some accessors in the object including this module.
  #
  # @param [Class] base The class including the module.
  def self.included(base)
    # `send` because attr_accessor/attr_reader are private on older Rubies.
    base.send(:attr_accessor, *ATTRIBUTES)
    base.send(:attr_reader, :failures)
  end

  # @return [String] a one-line description of the episode followed by its
  #   failures, one per line.
  def to_s
    "show: #{show_name} - show ref: #{show_ref} - title: #{title} - url: #{url} - notes: #{notes} - failures: #{Array(failures).join("\n")}"
  end

  # An episode is considered failed when it has no url or when at least one
  # failure was recorded.
  #
  # @return [Boolean]
  def failed?
    # Array() tolerates a host class that never initialized @failures.
    url.nil? || !Array(failures).empty?
  end
end
View
27 scrapers/pluzz_francetv_fr.rb
@@ -1,11 +1,13 @@
require_relative 'utils'
+require_relative 'interfaces'
require 'json'
+require 'date'
module FranceTVJeunesse
def self.run
agent = Mechanize.new
- url = "http://pluzz.francetv.fr/ajax/launchsearch/rubrique/jeunesse/datedebut/#{Time.now.strftime("%Y-%m-%dT00:00")}/datefin/#{Time.now.strftime("%Y-%m-%dT23:59")}/type/lesplusrecents/nb/100/"
+ url = "http://pluzz.francetv.fr/ajax/launchsearch/rubrique/jeunesse/datedebut/#{(Date.today - 1).strftime("%Y-%m-%dT00:00")}/datefin/#{Date.today.strftime("%Y-%m-%dT23:59")}/type/lesplusrecents/nb/200/"
page = agent.get(url)
episodes = fetch_episodes(page)
STDERR << "Error scraping #{url}" if episodes.find{|e| e.failed?}
@@ -26,15 +28,10 @@ def self.fetch_episodes(page)
end
episodes
end
-
+
class Episode
include Scrapbook::Utils::Fetcher
-
- ATTRIBUTES = [:show_name, :show_ref,
- :title, :url, :image_url, :broadcast_date, :notes ]
-
- attr_accessor *ATTRIBUTES
- attr_reader :failures
+ include EpisodeInterface
def initialize(opts=nil)
@failures = []
@@ -46,25 +43,13 @@ def initialize(opts=nil)
self
end
- def to_s
- "show: #{show_name} - show ref: #{show_ref} - title: #{title} - url: #{url} - notes: #{notes} - failures: #{self.failures.join("\n")}"
- end
-
def to_json
hash = {}
- ATTRIBUTES.each do |att|
+ EpisodeInterface::ATTRIBUTES.each do |att|
hash[att] = self.send(att)
end
hash.to_json
end
- def failed?
- if self.url.nil? || !self.failures.empty?
- true
- else
- false
- end
- end
-
end
end
Please sign in to comment.
Something went wrong with that request. Please try again.