Skip to content

Commit

Permalink
Migrate from Scraper/JSDom/jQuery to Request/Cheerio
Browse files Browse the repository at this point in the history
  • Loading branch information
markdalgleish committed Sep 17, 2012
1 parent f1840b9 commit 0d014d5
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 162 deletions.
7 changes: 4 additions & 3 deletions lib/helper.js
Expand Up @@ -25,13 +25,14 @@ Helper.prototype = {
 },

 getText: function(selector) {
-var $results = this.$(selector);
+var $results = this.$(selector),
+trimmedText = $results.text().trim();

-if ($results.length === 0 || this.$.trim($results.text()) === '') {
+if ($results.length === 0 || trimmedText === '') {
 return undefined;
 }

-return this.$.trim($results.text());
+return trimmedText;
 },

 getHref: function(selector) {
Expand Down
225 changes: 120 additions & 105 deletions lib/lanyrd-scraper.js
Expand Up @@ -6,143 +6,158 @@
 * Licensed under the MIT license.
 */

-var scraper = require('scraper'),
+var request = require('request'),
+cheerio = require('cheerio'),
 Helper = require('./helper');

-exports.scrape = function(url, callback) {
-var helper = new Helper();
+var parseEvent = function(markup, callback) {
+var $ = cheerio.load(markup),
+helper = new Helper();

-url = helper.resolveUrl(url);
+helper.setContext($);

-scraper(url, function(err, $) {
-if (err) {
-callback(err, null);
-return;
-}
+if ($('h1').text().trim() === '404: Whoops!') {
+callback(new Error('Event not found'), null);
+return;
+}

-if ($.trim($('h1').text()) === '404: Whoops!') {
-callback(new Error('Event not found'), null);
-return;
-}
+if ($('h1.summary').length === 0) {
+callback(new Error('Invalid page format'), null);
+return;
+}

-if ($('h1.summary').length === 0) {
-callback(new Error('Invalid page format'), null);
-return;
-}
+helper.setContext($);

-helper.setContext($);
+var data = {};

-var data = {};
+data.title = helper.getText('h1.summary');
+data.tagline = helper.getText('h2.tagline');

-data.title = helper.getText('h1.summary');
-data.tagline = helper.getText('h2.tagline');
+data.websiteUrl = helper.getHref('.item-meta a.website');
+data.scheduleUrl = helper.getHref('.item-meta a.seeschedule');

-data.websiteUrl = helper.getHref('.item-meta a.website');
-data.scheduleUrl = helper.getHref('.item-meta a.seeschedule');
+data.location = '' + helper.getText('.prominent-place .sub-place') + ', ' + helper.getText('.prominent-place .place-context a');
+if (data.location === ', ') {
+data.location = undefined;
+}

-data.location = '' + helper.getText('.prominent-place .sub-place') + ', ' + helper.getText('.prominent-place .place-context a');
-if (data.location === ', ') {
-data.location = undefined;
-}
+// Parse the address and notes
+var addressValues = $('#venues p').eq(1).text().split('- ');
+var address = addressValues[0].trim();
+addressValues.shift();
+var note = addressValues.length > 0 ? addressValues.join('- ').trim() : undefined;

-// Parse the address and notes
-var addressValues = $('#venues p:eq(1)').text().split('- ');
-var address = $.trim(addressValues[0]);
-addressValues.shift();
-var note = addressValues.length > 0 ? $.trim(addressValues.join('- ')) : undefined;
+data.venues = [];

-data.venues = [];
+data.venues.push({
+name: helper.getText('#venues h3 a'),
+address: address,
+note: note,
+googleMapsUrl: helper.getHref('#venues a.map-icon')
+});

-data.venues.push({
-name: helper.getText('#venues h3 a'),
-address: address,
-note: note,
-googleMapsUrl: helper.getHref('#venues a.map-icon')
-});
+data.startDate = helper.getTitle('.main-date .dtstart');
+data.endDate = helper.getTitle('.main-date .dtend');
+data.time = helper.getText('.main-date abbr.dtstart .time');

-data.startDate = helper.getTitle('.main-date .dtstart');
-data.endDate = helper.getTitle('.main-date .dtend');
-data.time = helper.getText('.main-date abbr.dtstart .time');
+data.hashtag = helper.getHashtag('.item-meta a.hashtag');
+data.twitterHandle = helper.getTwitterHandle('.item-meta a.twitter');

-data.hashtag = helper.getHashtag('.item-meta a.hashtag');
-data.twitterHandle = helper.getTwitterHandle('.item-meta a.twitter');
+data.speakers = [];
+$('#speaker-list li').each(function(){
+var speaker = {},
+$this = $(this);

-data.speakers = [];
-$('#speaker-list li').each(function(){
-var speaker = {},
-$this = $(this);
+speaker.name = $this.find('span.name').text();
+speaker.twitterHandle = helper.getTwitterHandle($this.find('span.handle'));

-speaker.name = $this.find('span.name').text();
-speaker.twitterHandle = helper.getTwitterHandle($this.find('span.handle'));
+data.speakers.push(speaker);
+});

-data.speakers.push(speaker);
+data.sessions = [];
+$('li.session-detail').each(function(){
+var $this = $(this),
+$titleLink = $this.find('h3 a'),
+$link = $this.find('p a'),
+$time = $this.find('.time');
+
+var session = {
+title: $titleLink.text(),
+url: helper.getHref($titleLink),
+startTime: helper.getText($time),
+speakers: []
+};
+
+var $paragraph = $this.find('p');
+
+// Remove 'presented by'
+$paragraph.find('strong').remove();
+
+// Loop through all speakers with accounts, remove tags from DOM
+$paragraph.find('a').remove().each(function(){
+var $link = $(this);
+
+session.speakers.push({
+name: $link.text(),
+twitterHandle: helper.getTwitterHandle($link)
+});
 });

-data.sessions = [];
-$('li.session-detail').each(function(){
-var $this = $(this),
-$titleLink = $this.find('h3 a'),
-$link = $this.find('p a'),
-$time = $this.find('.time');
-
-var session = {
-title: $titleLink.text(),
-url: helper.getHref($titleLink),
-startTime: helper.getText($time),
-speakers: []
-};
-
-var $paragraph = $this.find('p');
-
-// Remove 'presented by'
-$paragraph.find('strong').remove();
-
-// Loop through all speakers with accounts, remove tags from DOM
-$paragraph.find('a').remove().each(function(){
-var $link = $(this);
-
+// Convert remaining names into an array
+$paragraph.text().replace(' and ', ', ').split(', ').forEach(function(speakerName){
+speakerName = speakerName.trim();
+
+if (speakerName !== '') {
 session.speakers.push({
-name: $link.text(),
-twitterHandle: helper.getTwitterHandle($link)
+name: speakerName
 });
-});
+}
+});

-// Convert remaining names into an array
-$paragraph.text().replace(' and ', ', ').split(', ').forEach(function(speakerName){
-speakerName = $.trim(speakerName);
+data.sessions.push(session);
+});

-if (speakerName !== '') {
-session.speakers.push({
-name: speakerName
-});
-}
-});
+data.attendees = [];
+$('.attendees-placeholder li').each(function(){
+var attendee = {},
+$this = $(this);

-data.sessions.push(session);
-});
+attendee.name = $this.find('img:first').attr('alt');
+attendee.twitterHandle = helper.getTwitterHandle($this.find('a'));

-data.attendees = [];
-$('.attendees-placeholder li').each(function(){
-var attendee = {},
-$this = $(this);
+data.attendees.push(attendee);
+});

-attendee.name = $this.find('img:first').attr('alt');
-attendee.twitterHandle = helper.getTwitterHandle($this.find('a'));
+data.trackers = [];
+$('.trackers-placeholder li').each(function(){
+var tracker = {},
+$this = $(this);

-data.attendees.push(attendee);
-});
+tracker.name = $this.find('img:first').attr('alt');
+tracker.twitterHandle = helper.getTwitterHandle($this.find('a'));

-data.trackers = [];
-$('.trackers-placeholder li').each(function(){
-var tracker = {},
-$this = $(this);
+data.trackers.push(tracker);
+});

-tracker.name = $this.find('img:first').attr('alt');
-tracker.twitterHandle = helper.getTwitterHandle($this.find('a'));
+callback(null, data);
+};

-data.trackers.push(tracker);
-});
+var scrape = function(url, callback) {
+var helper = new Helper();

-callback(null, data);
+url = helper.resolveUrl(url);
+
+request(url, function(err, resp, body) {
+if (err) {
+callback(err, null);
+return;
+}
+
+parseEvent(body, callback);
 });
 };
+
+module.exports = {
+scrape: scrape,
+parseEvent: parseEvent
+};
72 changes: 36 additions & 36 deletions package.json
@@ -1,38 +1,38 @@
 {
-"name": "lanyrd-scraper",
-"description": "Scraper for Lanyrd events",
-"version": "0.1.1",
-"author": {
-"name": "Mark Dalgleish",
-"url": "http://markdalgleish.com"
-},
-"repository": {
-"type": "git",
-"url": "git://github.com/markdalgleish/node-lanyrd-scraper.git"
-},
-"bugs": {
-"url": "https://github.com/markdalgleish/node-lanyrd-scraper/issues"
-},
-"licenses": [
-{
-"type": "MIT",
-"url": "http://markdalgleish.mit-license.org"
-}
-],
-"main": "lib/lanyrd-scraper",
-"engines": {
-"node": ">=0.6"
-},
-"scripts": {
-"test": "grunt lint && grunt test"
-},
-"dependencies": {
-"scraper": "0.0.9"
-},
-"devDependencies": {
-"grunt": "~0.3.9",
-"express": "~2.5.11",
-"jquery": "~1.7.3"
-},
-"keywords": []
+"name": "lanyrd-scraper",
+"description": "Scraper for Lanyrd events",
+"version": "0.1.1",
+"author": {
+"name": "Mark Dalgleish",
+"url": "http://markdalgleish.com"
+},
+"repository": {
+"type": "git",
+"url": "git://github.com/markdalgleish/node-lanyrd-scraper.git"
+},
+"bugs": {
+"url": "https://github.com/markdalgleish/node-lanyrd-scraper/issues"
+},
+"licenses": [
+{
+"type": "MIT",
+"url": "http://markdalgleish.mit-license.org"
+}
+],
+"main": "lib/lanyrd-scraper",
+"engines": {
+"node": ">=0.6"
+},
+"scripts": {
+"test": "grunt lint && grunt test"
+},
+"dependencies": {
+"cheerio": "~0.9.2",
+"request": "~2.11.1"
+},
+"devDependencies": {
+"grunt": "~0.3.9",
+"express": "~2.5.11"
+},
+"keywords": []
 }

0 comments on commit 0d014d5

Please sign in to comment.