Skip to content

Commit

Permalink
Simplify entry point, factor out parsers to allow more types
Browse files: browse the repository at this point in the history
  • Loading branch information
markdalgleish committed Sep 17, 2012
1 parent 74bfc98 commit ea1e43a
Show file tree
Hide file tree
Showing 5 changed files with 294 additions and 313 deletions.
140 changes: 7 additions & 133 deletions lib/lanyrd-scraper.js
Expand Up @@ -8,151 +8,25 @@

var request = require('request'),
cheerio = require('cheerio'),
resolve = require('./urlresolver').resolveUrl,
Page = require('./page');

/**
 * Turns the markup of a Lanyrd event page into a plain data object.
 *
 * The callback is invoked once. It receives an Error (and `null` data) when
 * the first-level heading reads '404: Whoops!' or when the page contains no
 * `h1.summary` element; otherwise it receives `null` and the event data.
 *
 * @param {String} markup HTML source handed to `cheerio.load`.
 * @param {Function} callback Node-style callback `(err, data)`.
 */
var parseEvent = function(markup, callback) {
  var $ = cheerio.load(markup),
    page = new Page($);

  // Builds an array holding one record per element matched by `selector`.
  function collect(selector, toRecord) {
    var records = [];

    $(selector).each(function() {
      records.push(toRecord($(this)));
    });

    return records;
  }

  // Attendee and tracker entries use the same shape: avatar alt text + link.
  function toPerson($item) {
    return {
      name: $item.find('img:first').attr('alt'),
      twitterHandle: page.getTwitterHandle($item.find('a'))
    };
  }

  // One entry of the event-wide speaker list.
  function toSpeaker($item) {
    return {
      name: $item.find('span.name').text(),
      twitterHandle: page.getTwitterHandle($item.find('span.handle'))
    };
  }

  // One scheduled session, including everyone named in its byline.
  function toSession($item) {
    var $heading = $item.find('h3 a'),
      $when = $item.find('.time'),
      presenters = [],
      session = {
        title: $heading.text(),
        url: page.getHref($heading),
        startTime: page.getText($when),
        speakers: presenters
      };

    var $byline = $item.find('p');

    // Drop the 'presented by' label so only names remain
    $byline.find('strong').remove();

    // Linked speakers: pull the links out of the paragraph and record each
    var $linked = $byline.find('a').remove();
    $linked.each(function() {
      var $anchor = $(this);

      presenters.push({
        name: $anchor.text(),
        twitterHandle: page.getTwitterHandle($anchor)
      });
    });

    // Whatever text is left is a list of speakers without a link
    var leftovers = $byline.text().replace(' and ', ', ').split(', ');
    for (var i = 0; i < leftovers.length; i += 1) {
      var plainName = leftovers[i].trim();

      if (plainName !== '') {
        presenters.push({ name: plainName });
      }
    }

    return session;
  }

  if ($('h1').text().trim() === '404: Whoops!') {
    callback(new Error('Event not found'), null);
    return;
  }

  if ($('h1.summary').length === 0) {
    callback(new Error('Invalid page format'), null);
    return;
  }

  var event = {};

  event.title = page.getText('h1.summary');
  event.tagline = page.getText('h2.tagline');

  event.websiteUrl = page.getHref('.item-meta a.website');
  event.scheduleUrl = page.getHref('.item-meta a.seeschedule');

  // "<sub place>, <place context>"; a bare separator means neither was found
  var subPlace = page.getText('.prominent-place .sub-place'),
    placeContext = page.getText('.prominent-place .place-context a'),
    placeLabel = subPlace + ', ' + placeContext;
  event.location = placeLabel === ', ' ? undefined : placeLabel;

  // Second venue paragraph reads "<address>- <note>"; the note is optional
  var venueParts = $('#venues p').eq(1).text().split('- '),
    street = venueParts[0].trim(),
    noteParts = venueParts.slice(1),
    remark;
  if (noteParts.length > 0) {
    remark = noteParts.join('- ').trim();
  }

  event.venues = [{
    name: page.getText('#venues h3 a'),
    address: street,
    note: remark,
    googleMapsUrl: page.getHref('#venues a.map-icon')
  }];

  event.startDate = page.getTitle('.main-date .dtstart');
  event.endDate = page.getTitle('.main-date .dtend');
  event.time = page.getText('.main-date abbr.dtstart .time');

  event.hashtag = page.getHashtag('.item-meta a.hashtag');
  event.twitterHandle = page.getTwitterHandle('.item-meta a.twitter');

  event.speakers = collect('#speaker-list li', toSpeaker);
  event.sessions = collect('li.session-detail', toSession);
  event.attendees = collect('.attendees-placeholder li', toPerson);
  event.trackers = collect('.trackers-placeholder li', toPerson);

  callback(null, event);
};
urlResolver = require('./urlresolver'),
Page = require('./page'),
parse = require('./parsers');

/**
 * Fetches a Lanyrd page and passes its body to the parser registered for the
 * page's type.
 *
 * The URL is passed through `urlResolver.resolveUrl` once before the request
 * is made. The page type comes from `urlResolver.resolvePageType` and falls
 * back to 'event' when that returns a falsy value. The callback is invoked
 * exactly once per call.
 *
 * @param {String} url Value handed to `urlResolver.resolveUrl`.
 * @param {Function} callback Node-style callback `(err, data)`. Receives the
 *   request error, an 'Unsupported page type' Error when no parser is
 *   registered for the resolved type, or whatever the selected parser reports.
 */
var scrape = function(url, callback) {
  var resolvedUrl = urlResolver.resolveUrl(url);

  request(resolvedUrl, function(err, resp, body) {
    if (err) {
      callback(err, null);
      return;
    }

    var pageType = urlResolver.resolvePageType(resolvedUrl) || 'event';

    // Only dispatch to parsers registered on the map itself; report an
    // unknown type through the callback instead of throwing a TypeError
    // from inside the request handler.
    var isRegistered = Object.prototype.hasOwnProperty.call(parse, pageType) &&
      typeof parse[pageType] === 'function';

    if (!isRegistered) {
      callback(new Error('Unsupported page type: ' + pageType), null);
      return;
    }

    parse[pageType](body, callback);
  });
};

module.exports = {
scrape: scrape,
parseEvent: parseEvent
parse: parse
};
121 changes: 121 additions & 0 deletions lib/parsers/event.js
@@ -0,0 +1,121 @@
var cheerio = require('cheerio'),
Page = require('../page');

/**
 * Builds an array with one record per element matched by `selector`.
 *
 * @param {Function} $ Selector function returned by `cheerio.load`.
 * @param {String} selector Selector for the elements to visit.
 * @param {Function} build Receives the wrapped element, returns its record.
 * @returns {Array} Records in match order.
 */
var collectEach = function($, selector, build) {
  var records = [];

  $(selector).each(function() {
    records.push(build($(this)));
  });

  return records;
};

/**
 * Reads the venue block: name, address, optional note and map link.
 * The second `#venues p` paragraph is split on '- '; the first piece is the
 * address and anything after it is re-joined as the note.
 */
var parseVenue = function($, page) {
  var parts = $('#venues p').eq(1).text().split('- '),
    address = parts[0].trim(),
    noteParts = parts.slice(1);

  return {
    name: page.getText('#venues h3 a'),
    address: address,
    note: noteParts.length > 0 ? noteParts.join('- ').trim() : undefined,
    googleMapsUrl: page.getHref('#venues a.map-icon')
  };
};

/**
 * Reads a list whose items carry a name in the first image's alt text and a
 * link passed to `page.getTwitterHandle` (used for attendees and trackers).
 */
var parseAvatarList = function($, page, selector) {
  return collectEach($, selector, function($item) {
    return {
      name: $item.find('img:first').attr('alt'),
      twitterHandle: page.getTwitterHandle($item.find('a'))
    };
  });
};

/**
 * Reads one `li.session-detail` item: title, link, start time and speakers.
 * Linked speakers are recorded with a Twitter handle; names left in the
 * paragraph text after the links are removed are recorded by name only.
 */
var parseSession = function($, page, $session) {
  var $titleLink = $session.find('h3 a'),
    $time = $session.find('.time');

  var session = {
    title: $titleLink.text(),
    url: page.getHref($titleLink),
    startTime: page.getText($time),
    speakers: []
  };

  var $paragraph = $session.find('p');

  // Remove 'presented by'
  $paragraph.find('strong').remove();

  // Loop through all speakers with accounts, remove tags from DOM
  $paragraph.find('a').remove().each(function() {
    var $link = $(this);

    session.speakers.push({
      name: $link.text(),
      twitterHandle: page.getTwitterHandle($link)
    });
  });

  // Convert remaining names into an array
  $paragraph.text().replace(' and ', ', ').split(', ').forEach(function(speakerName) {
    speakerName = speakerName.trim();

    if (speakerName !== '') {
      session.speakers.push({
        name: speakerName
      });
    }
  });

  return session;
};

/**
 * Parses the markup of a Lanyrd event page into a plain data object.
 *
 * The callback is invoked once. It receives an Error (and `null` data) when
 * the first-level heading reads '404: Whoops!' ('Event not found') or when
 * the page has no `h1.summary` element ('Invalid page format'), so that a
 * missing or unrelated page is reported rather than returned as an object of
 * empty fields. Otherwise it receives `null` and the event data.
 *
 * @param {String} markup HTML source handed to `cheerio.load`.
 * @param {Function} callback Node-style callback `(err, data)`.
 */
var parse = function(markup, callback) {
  var $ = cheerio.load(markup),
    page = new Page($),
    data = {};

  if ($('h1').text().trim() === '404: Whoops!') {
    callback(new Error('Event not found'), null);
    return;
  }

  if ($('h1.summary').length === 0) {
    callback(new Error('Invalid page format'), null);
    return;
  }

  data.title = page.getText('h1.summary');
  data.tagline = page.getText('h2.tagline');

  data.websiteUrl = page.getHref('.item-meta a.website');
  data.scheduleUrl = page.getHref('.item-meta a.seeschedule');

  // A bare separator means neither half of the location was found
  data.location = '' + page.getText('.prominent-place .sub-place') + ', ' + page.getText('.prominent-place .place-context a');
  if (data.location === ', ') {
    data.location = undefined;
  }

  data.venues = [parseVenue($, page)];

  data.startDate = page.getTitle('.main-date .dtstart');
  data.endDate = page.getTitle('.main-date .dtend');
  data.time = page.getText('.main-date abbr.dtstart .time');

  data.hashtag = page.getHashtag('.item-meta a.hashtag');
  data.twitterHandle = page.getTwitterHandle('.item-meta a.twitter');

  data.speakers = collectEach($, '#speaker-list li', function($item) {
    return {
      name: $item.find('span.name').text(),
      twitterHandle: page.getTwitterHandle($item.find('span.handle'))
    };
  });

  data.sessions = collectEach($, 'li.session-detail', function($item) {
    return parseSession($, page, $item);
  });

  data.attendees = parseAvatarList($, page, '.attendees-placeholder li');
  data.trackers = parseAvatarList($, page, '.trackers-placeholder li');

  callback(null, data);
};

module.exports = parse;
3 changes: 3 additions & 0 deletions lib/parsers/index.js
@@ -0,0 +1,3 @@
// Registry of page parsers, keyed by page type name.
var parsers = {};

parsers.event = require('./event');

module.exports = parsers;

0 comments on commit ea1e43a

Please sign in to comment.