From e6c4df46330ba74edad67eabe0f06ee7f92c562a Mon Sep 17 00:00:00 2001 From: nelsonic Date: Wed, 29 May 2019 09:00:46 +0100 Subject: [PATCH] [WiP] revive issue parser for #103 --- lib/issue.js | 50 ++++++++++++++++++++++++---------------------- lib/scrapers.js | 2 +- lib/switcher.js | 13 ++++++------ test/issue.test.js | 14 ++++++++----- 4 files changed, 43 insertions(+), 36 deletions(-) diff --git a/lib/issue.js b/lib/issue.js index 724d502..c8b81db 100644 --- a/lib/issue.js +++ b/lib/issue.js @@ -1,51 +1,53 @@ /** * profile method scrapes a given GitHub user profile * @param {Object} $ - cheerio object with DOM of page to be scraped - * @param {string} url - a valid GitHub issue url - * @param {function} callback - the callback we should call after scraping + * @param {String} url - a valid GitHub issue url + * @param {Function} callback - the callback we should call after scraping * a callback passed into this method should accept two parameters: - * @param {objectj} error an error object (set to null if no error occurred) - * @param {object} data - the complete issue contents + meta data + * @param {Object} error an error object (set to null if no error occurred) + * @param {Object} data - the complete issue contents + meta data */ module.exports = function issue($, url, callback) { -/* UNCOMMENT THIS IF YOU HAVE TIME/PATIENTCE TO FIX IT...! + var data = { entries : [], labels : [], participants : [] }; data.url = url; - data.title = $('.js-issue-title').first().text().trim(); - data.state = $('.state').first().text().trim(); - console.log(' - - - - - > ' +data.state) + // console.log($('.gh-header-title')); + data.title = $('.gh-header-title').first().text().trim().split('\n')[0]; + + data.state = $('.State').first().text().trim(); data.author = $('.gh-header-meta .author').first().text().trim(); - data.created = $('.gh-header-meta time')[0].attribs.datetime; + data.created = $('relative-time')[0].attribs.datetime; // labels - $('.label').each(function(){ + $('.IssueLabel').each(function(){ data.labels.push($(this).attr('title')); }) + // data.labels.filter((l) => l === true); var milestone = $('.milestone-name') if(milestone.length > 0){ data.milestone = milestone[0].attribs.title; } - var assignee = $('.sidebar-assignee img'); + var assignee = $('.assignee'); if(assignee.length > 0){ - data.assignee = assignee[0].attribs.alt.replace('@', ''); + data.assignee = assignee.text().trim(); } //participants $('.participant-avatar').each(function(){ - data.participants.push($(this).attr('aria-label')); + data.participants.push($(this).attr('href').replace('/','')); }) - + console.log(' - - - - - > data', data) // NOTE: this is possibly the most messed up DOM structure ever! - // its almost as if someone @GitHub is deliberately trying ot prevent crawlers! + // its almost as if someone @GitHub is deliberately trying to prevent crawlers var entries = $('.comment:nth-child(2)'); // yes! its bananas! - for(var i=0; i < entries.length; i++) { - var id = entries[i].attribs.id; // see: http://git.io/vOC5d - var entry = {"id":id}; - entry.author = $('#'+id+' .author').attr('href').replace('/',''); - entry.created = $('#'+id+' time').attr('datetime'); - entry.body = $('#'+id+' .comment-body').first().text().trim(); - data.entries.push(entry); - } + // for(var i=0; i < entries.length; i++) { + // var id = entries[i].attribs.id; // see: http://git.io/vOC5d + // var entry = {"id":id}; + // entry.author = $('#'+id+' .author').attr('href').replace('/',''); + // entry.created = $('#'+id+' time').attr('datetime'); + // entry.body = $('#'+id+' .comment-body').first().text().trim(); + // data.entries.push(entry); + // } return callback(null, data); -*/ + } diff --git a/lib/scrapers.js b/lib/scrapers.js index 6aed8a0..71497da 100644 --- a/lib/scrapers.js +++ b/lib/scrapers.js @@ -1,7 +1,7 @@ module.exports = { // feed: require('./feed'), // activity feed (RSS) followers: require('./followers'), // also scrapes following or stargazers - // issue: require('./issue'), + issue: require('./issue'), // issues: require('./issues'), // issues_search: require('./issues_search'), // labels : require('./labels'), diff --git a/lib/switcher.js b/lib/switcher.js index 3ef4db9..9fcc168 100644 --- a/lib/switcher.js +++ b/lib/switcher.js @@ -76,18 +76,19 @@ module.exports = function switcher (url, callback) { else if(url.match(/people/)) { scraper = 'people'; } - else { - scraper = 'repo'; - } // else if(url.match(/milestones/)) { // scraper = 'milestones'; // } // else if(url.match(/labels/)) { // scraper = 'labels'; // } - // else if($('.issue').length > 0) { - // scraper = 'issue'; - // } + else if($('.issue').length > 0) { + scraper = 'issue'; + } + else { + scraper = 'repo'; + } + // else { // else if(url.match(/issues/)) { // scraper = 'issues'; // } diff --git a/test/issue.test.js b/test/issue.test.js index 881511d..9595234 100644 --- a/test/issue.test.js +++ b/test/issue.test.js @@ -1,7 +1,7 @@ var test = require('tape'); var issue = require('../lib/switcher'); -test.skip('Scrape /dwyl/tudo/issues/51 for all comments and meta-data', function(t){ +test.only('Scrape /dwyl/tudo/issues/51 for comments & meta-data', function (t) { var url = '/dwyl/tudo/issues/51'; issue(url, function(err, data) { t.ok(data.url.indexOf(url) > -1, url + ' is: ' +data.url) @@ -11,12 +11,16 @@ test.skip('Scrape /dwyl/tudo/issues/51 for all comments and meta-data', function t.ok(data.created.length > 0, url + ' was created on: '+data.created); // labels t.ok(data.labels.length > 2, url + ' has '+data.labels.length + ' labels') - t.ok(data.milestone === 'Minimal Usable Product', 'Milestone is: '+data.milestone); + t.ok(data.milestone === 'Minimal Usable Product', 'Milestone is: ' + + data.milestone); t.ok(data.assignee.length > 0, url + ' has assignee: '+ data.assignee); - t.ok(data.participants.length > 2, url + ' has participants: ' + data.participants); - t.ok(data.participants.indexOf('iteles') > -1, url + ' has participation from @iteles'); + t.ok(data.participants.length > 2, url + ' has participants: ' + + data.participants); + t.ok(data.participants.indexOf('iteles') > -1, url + + ' has participation from @iteles'); - t.ok(data.entries.length > 2, url + ' has: '+data.entries.length); + t.ok(data.entries.length > 2, + url + ' has: '+data.entries.length + ' comments'); t.end(); });