Permalink
Browse files

Group scraped text by element ID.

  • Loading branch information...
1 parent 9b4c692 · commit dfa55f6b5b5120e69869193709aaf362c815875a · @rybesh (rybesh) committed Mar 30, 2012
Showing with 116 additions and 18 deletions.
  1. +32 −18 src/components/ScrapeHtml.coffee
  2. +84 −0 test/ScrapeHtml.coffee
@@ -3,23 +3,36 @@ jsdom = require "jsdom"
class ScrapeHtml extends noflo.Component
constructor: ->
- @html = ""
+ @html = []
@textSelector = ""
- @crapSelectors = []
+ @ignoreSelectors = []
@inPorts =
in: new noflo.Port()
textSelector: new noflo.Port()
- crapSelector: new noflo.ArrayPort()
+ ignoreSelector: new noflo.ArrayPort()
@outPorts =
out: new noflo.Port()
error: new noflo.Port()
html = ""
+ @inPorts.in.on "connect", =>
+ @html = []
+ @inPorts.in.on "begingroup", (group) =>
+ @outPorts.out.beginGroup group
@inPorts.in.on "data", (data) =>
html += data
+ @inPorts.in.on "endgroup", =>
+ @once "scraped", =>
+ @outPorts.out.endGroup()
+ @html.push html
+ html = ""
+ @scrapeHtml()
@inPorts.in.on "disconnect", =>
- @html = html
+ @once "scraped", =>
+ @outPorts.out.disconnect()
+ return if @html.length > 0 # we are using groups
+ @html.push html
html = ""
@scrapeHtml()
@@ -28,21 +41,22 @@ class ScrapeHtml extends noflo.Component
@inPorts.textSelector.on "disconnect", =>
@scrapeHtml()
- @inPorts.crapSelector.on "data", (data) =>
- @crapSelectors.push data
+ @inPorts.ignoreSelector.on "data", (data) =>
+ @ignoreSelectors.push data
scrapeHtml: ->
- return unless @html.length
- return unless @textSelector.length
- target = @outPorts.out
- jsdom.env @html, ['http://code.jquery.com/jquery.min.js'], (err, win) =>
- if err
- @outPorts.error.send err
- return @outPorts.error.disconnect()
- win.$(crap).remove() for crap in @crapSelectors
- for text in (win.$(@textSelector).map -> win.$(this).text())
- @outPorts.out.send text
- @outPorts.out.disconnect()
- @html = ""
+ return unless @html.length > 0 # no HTML has been buffered yet
+ return unless @textSelector.length > 0 # no selector has been received yet
+ for h in @html # every buffered HTML string is scraped separately
+ jsdom.env h, ['http://code.jquery.com/jquery.min.js'], (err, win) => # the jQuery script is what provides win.$ below
+ if err
+ @outPorts.error.send err
+ return @outPorts.error.disconnect() # after reporting the error, nothing is sent on "out" for this document
+ win.$(ignore).remove() for ignore in @ignoreSelectors # drop ignored elements before extracting text
+ win.$(@textSelector).map (i,e) => # NOTE(review): map is used only for its side effects; its result is discarded
+ @outPorts.out.beginGroup e.id if e.hasAttribute "id" # group the text under the element id when one exists
+ @outPorts.out.send win.$(e).text()
+ @outPorts.out.endGroup() if e.hasAttribute "id"
+ @emit "scraped" # notifies the @once "scraped" listeners; NOTE(review): @html is not cleared in this method (only on "connect") - confirm earlier entries are not re-scraped on a later call
exports.getComponent = -> new ScrapeHtml # factory: returns a new ScrapeHtml instance on every call
View
@@ -0,0 +1,84 @@
+scrape = require "../src/components/ScrapeHtml"
+socket = require "../src/lib/InternalSocket"
+
+setupComponent = -> # creates a component with sockets attached to its "in" and "out" ports
+ c = scrape.getComponent()
+ ins = socket.createSocket() # socket the tests use to send HTML to the "in" port
+ out = socket.createSocket() # socket the tests listen on for the "out" port
+ c.inPorts.in.attach ins
+ c.outPorts.out.attach out
+ return [c, ins, out] # callers destructure this as [component, input socket, output socket]
+
+exports["test selector then html"] = (test) -> # the selector is sent before the HTML
+ [c, ins, out] = setupComponent()
+ s = socket.createSocket()
+ c.inPorts.textSelector.attach s
+ expect = ["bar","baz"] # texts of the two p.test elements, in document order
+ out.once "begingroup", (group) ->
+ test.fail "should not get groups without element ids" # neither matched <p> carries an id
+ out.on "data", (data) ->
+ test.equal data, expect.shift()
+ test.done() if expect.length == 0 # finish once every expected text has been received
+ s.send "p.test"
+ s.disconnect()
+ ins.send '<div><p>foo</p><p class="test">ba' # the markup is split mid-element across two packets
+ ins.send 'r</p><p class="test">baz</p></div>'
+ ins.disconnect()
+
+exports["test html then selector"] = (test) -> # the HTML is sent before the selector
+ [c, ins, out] = setupComponent()
+ s = socket.createSocket()
+ c.inPorts.textSelector.attach s
+ expect = ["bar","baz"] # texts of the two p.test elements, in document order
+ out.on "data", (data) ->
+ test.equal data, expect.shift()
+ test.done() if expect.length == 0 # finish once every expected text has been received
+ ins.send '<div><p>foo</p><p class="test">ba' # the markup is split mid-element across two packets
+ ins.send 'r</p><p class="test">baz</p></div>'
+ ins.disconnect()
+ s.send "p.test"
+ s.disconnect()
+
+exports["test ignore"] = (test) -> # elements matching an ignore selector must not be emitted
+ [c, ins, out] = setupComponent()
+ s = socket.createSocket()
+ i = socket.createSocket() # socket feeding the ignoreSelector port
+ c.inPorts.textSelector.attach s
+ c.inPorts.ignoreSelector.attach i
+ expect = ["foo"] # the only p.test element that matches neither ignore selector
+ out.on "data", (data) ->
+ test.equal data, expect.shift()
+ test.done() if expect.length == 0 # finish once every expected text has been received
+ i.send ".noise"
+ i.send "#crap"
+ i.disconnect()
+ ins.send '<div><p class="test">foo</p><p id="crap" class="test">ba' # the markup is split mid-element across two packets
+ ins.send 'r</p><p class="test noise">baz</p></div>'
+ ins.disconnect()
+ s.send "p.test"
+ s.disconnect()
+
+exports["test group by element id"] = (test) -> # each matched element with an id should be wrapped in a group named after that id
+ [c, ins, out] = setupComponent()
+ s = socket.createSocket()
+ c.inPorts.textSelector.attach s
+ expectevent = "begingroup" # name of the event the out socket is expected to emit next
+ expectgroup = ["a","b"] # ids of the two matched <p> elements, in document order
+ out.on "begingroup", (group) ->
+ test.equal "begingroup", expectevent
+ test.equal group, expectgroup.shift()
+ expectevent = "data" # a group's text is expected right after the group opens
+ expectdata = ["bar","baz"] # texts of the two matched <p> elements, in document order
+ out.on "data", (data) ->
+ test.equal "data", expectevent
+ test.equal data, expectdata.shift()
+ expectevent = "endgroup" # the group is expected to close right after its text
+ out.on "endgroup", ->
+ test.equal "endgroup", expectevent
+ expectevent = "begingroup" # the next element's group may open now
+ test.done() if expectgroup.length == 0 # finish once every expected group id has been consumed
+ s.send "p.test"
+ s.disconnect()
+ ins.send '<div><p>foo</p><p id="a" class="test">ba' # the markup is split mid-element across two packets
+ ins.send 'r</p><p id="b" class="test">baz</p></div>'
+ ins.disconnect()

0 comments on commit dfa55f6

Please sign in to comment.