Skip to content

Commit

Permalink
Group scraped text by element ID.
Browse files Browse the repository at this point in the history
  • Loading branch information
rybesh committed Mar 30, 2012
1 parent 9b4c692 commit dfa55f6
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 18 deletions.
50 changes: 32 additions & 18 deletions src/components/ScrapeHtml.coffee
Expand Up @@ -3,23 +3,36 @@ jsdom = require "jsdom"

class ScrapeHtml extends noflo.Component
constructor: ->
@html = ""
@html = []
@textSelector = ""
@crapSelectors = []
@ignoreSelectors = []

@inPorts =
in: new noflo.Port()
textSelector: new noflo.Port()
crapSelector: new noflo.ArrayPort()
ignoreSelector: new noflo.ArrayPort()
@outPorts =
out: new noflo.Port()
error: new noflo.Port()

html = ""
@inPorts.in.on "connect", =>
@html = []
@inPorts.in.on "begingroup", (group) =>
@outPorts.out.beginGroup group
@inPorts.in.on "data", (data) =>
html += data
@inPorts.in.on "endgroup", =>
@once "scraped", =>
@outPorts.out.endGroup()
@html.push html
html = ""
@scrapeHtml()
@inPorts.in.on "disconnect", =>
@html = html
@once "scraped", =>
@outPorts.out.disconnect()
return if @html.length > 0 # we are using groups
@html.push html
html = ""
@scrapeHtml()

Expand All @@ -28,21 +41,22 @@ class ScrapeHtml extends noflo.Component
@inPorts.textSelector.on "disconnect", =>
@scrapeHtml()

@inPorts.crapSelector.on "data", (data) =>
@crapSelectors.push data
@inPorts.ignoreSelector.on "data", (data) =>
@ignoreSelectors.push data

scrapeHtml: ->
return unless @html.length
return unless @textSelector.length
target = @outPorts.out
jsdom.env @html, ['http://code.jquery.com/jquery.min.js'], (err, win) =>
if err
@outPorts.error.send err
return @outPorts.error.disconnect()
win.$(crap).remove() for crap in @crapSelectors
for text in (win.$(@textSelector).map -> win.$(this).text())
@outPorts.out.send text
@outPorts.out.disconnect()
@html = ""
return unless @html.length > 0
return unless @textSelector.length > 0
for h in @html
jsdom.env h, ['http://code.jquery.com/jquery.min.js'], (err, win) =>
if err
@outPorts.error.send err
return @outPorts.error.disconnect()
win.$(ignore).remove() for ignore in @ignoreSelectors
win.$(@textSelector).map (i,e) =>
@outPorts.out.beginGroup e.id if e.hasAttribute "id"
@outPorts.out.send win.$(e).text()
@outPorts.out.endGroup() if e.hasAttribute "id"
@emit "scraped"

exports.getComponent = -> new ScrapeHtml
84 changes: 84 additions & 0 deletions test/ScrapeHtml.coffee
@@ -0,0 +1,84 @@
scrape = require "../src/components/ScrapeHtml"
socket = require "../src/lib/InternalSocket"

# Create a fresh ScrapeHtml component with one new socket attached to its
# `in` port and another attached to its `out` port.
# Returns the triple [component, inSocket, outSocket].
setupComponent = ->
  component = scrape.getComponent()
  inSocket = socket.createSocket()
  outSocket = socket.createSocket()
  component.inPorts.in.attach inSocket
  component.outPorts.out.attach outSocket
  [component, inSocket, outSocket]

# Sends the text selector first, then the HTML in two chunks (split in the
# middle of an element), and expects the text "bar" followed by "baz" on the
# output. None of the matching elements carries an id, so any group opened on
# the output is treated as a failure.
exports["test selector then html"] = (test) ->
  [c, ins, out] = setupComponent()
  s = socket.createSocket()
  c.inPorts.textSelector.attach s
  expect = ["bar","baz"]
  out.once "begingroup", (group) ->
    # In the four-argument fail(actual, expected, message, operator) form a
    # lone string lands in `actual`, not `message`; test.ok false keeps the
    # text as the reported failure message.
    test.ok false, "should not get groups without element ids"
  out.on "data", (data) ->
    test.equal data, expect.shift()
    # Finish as soon as every expected value has been seen.
    test.done() if expect.length == 0
  s.send "p.test"
  s.disconnect()
  ins.send '<div><p>foo</p><p class="test">ba'
  ins.send 'r</p><p class="test">baz</p></div>'
  ins.disconnect()

# Same scenario as the selector-first test but in the opposite order: the HTML
# is sent (in two chunks) and disconnected before the selector arrives. The
# output is still expected to be "bar" followed by "baz".
exports["test html then selector"] = (test) ->
  [component, htmlIn, textOut] = setupComponent()
  selectorIn = socket.createSocket()
  component.inPorts.textSelector.attach selectorIn

  remaining = ["bar", "baz"]
  textOut.on "data", (text) ->
    wanted = remaining.shift()
    test.equal text, wanted
    # All expected packets consumed: the test is complete.
    if remaining.length is 0
      test.done()

  # HTML goes in first, deliberately split in the middle of an element.
  htmlIn.send '<div><p>foo</p><p class="test">ba'
  htmlIn.send 'r</p><p class="test">baz</p></div>'
  htmlIn.disconnect()

  # The selector arrives only after the HTML connection has closed.
  selectorIn.send "p.test"
  selectorIn.disconnect()

# Feeds two ignore selectors (".noise" and "#crap") before the HTML and the
# text selector. Of the three `p.test` elements in the HTML, two match an
# ignore selector, so only "foo" is expected on the output.
exports["test ignore"] = (test) ->
  [component, htmlIn, textOut] = setupComponent()
  selectorIn = socket.createSocket()
  ignoreIn = socket.createSocket()
  component.inPorts.textSelector.attach selectorIn
  component.inPorts.ignoreSelector.attach ignoreIn

  remaining = ["foo"]
  textOut.on "data", (text) ->
    wanted = remaining.shift()
    test.equal text, wanted
    if remaining.length is 0
      test.done()

  # Ignore selectors first: one by class, one by id.
  ignoreIn.send ".noise"
  ignoreIn.send "#crap"
  ignoreIn.disconnect()

  # Then the HTML, split in the middle of the "#crap" element.
  htmlIn.send '<div><p class="test">foo</p><p id="crap" class="test">ba'
  htmlIn.send 'r</p><p class="test noise">baz</p></div>'
  htmlIn.disconnect()

  # Finally the text selector.
  selectorIn.send "p.test"
  selectorIn.disconnect()

# Checks that text from elements carrying an id is wrapped in a group named
# after that id. The output must cycle begingroup -> data -> endgroup for each
# matching element: group "a" with "bar", then group "b" with "baz".
exports["test group by element id"] = (test) ->
  [component, htmlIn, textOut] = setupComponent()
  selectorIn = socket.createSocket()
  component.inPorts.textSelector.attach selectorIn

  # Name of the event the output socket is expected to fire next.
  nextEvent = "begingroup"
  pendingGroups = ["a", "b"]
  pendingTexts = ["bar", "baz"]

  textOut.on "begingroup", (group) ->
    test.equal "begingroup", nextEvent
    test.equal group, pendingGroups.shift()
    nextEvent = "data"

  textOut.on "data", (text) ->
    test.equal "data", nextEvent
    test.equal text, pendingTexts.shift()
    nextEvent = "endgroup"

  textOut.on "endgroup", ->
    test.equal "endgroup", nextEvent
    nextEvent = "begingroup"
    # Done once the last expected group has been opened and closed.
    if pendingGroups.length is 0
      test.done()

  selectorIn.send "p.test"
  selectorIn.disconnect()
  htmlIn.send '<div><p>foo</p><p id="a" class="test">ba'
  htmlIn.send 'r</p><p id="b" class="test">baz</p></div>'
  htmlIn.disconnect()

0 comments on commit dfa55f6

Please sign in to comment.