Skip to content

Commit

Permalink
Merge pull request #133 from nelsonic/org_repos-#131
Browse files Browse the repository at this point in the history
PR: Org Repos #131
  • Loading branch information
asntc committed Apr 23, 2024
2 parents 1a40cad + 7fd29ad commit bee7655
Show file tree
Hide file tree
Showing 17 changed files with 6,102 additions and 3,125 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Expand Up @@ -29,3 +29,6 @@ node_modules
.vagrant
crawl.js
.DS_Store

.env
tmp/
18 changes: 18 additions & 0 deletions index.js
@@ -0,0 +1,18 @@
require("env2")(".env");
const debug = require("./lambda/debug.js");
const gs = require('github-scraper');

exports.handler = function handler (event, context, callback) {
console.log(event);
console.log("Hi Friends!")
debug(event);
console.log('rawPath:', event.rawPath)

const url = event.rawPath;
gs(url, function(err, data) {
console.log(' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ')
console.log(data);

return callback(null, data);
});
}
24 changes: 24 additions & 0 deletions lambda/debug.js
@@ -0,0 +1,24 @@
'use strict';
require('env2')('.env');
const save = require('./s3.js').save;

/**
* `debug` is used to debug SNS notification events.
* it only gets executed if the NODE_ENV is set to "test".
* To save event data to S3 you will need to add AWS_S3_BUCKET to .env
* see: github.com/dwyl/aws-ses-lambda/issues/12
* @param {Object} event - the object we want to store on S3
*/
module.exports = function debug (event) {
// console.log("process.env.NODE_ENV:", process.env.NODE_ENV);
if (process.env.NODE_ENV === "test") {
if(event.Records && !event.key) {
event.key = "sns";
}
save(event, function callback (error, data) {
console.log("DEBUG - - - error:", error, " - - - data:");
console.log(data);
console.log(" - - - - - - - - - - - - - - - - - - - - ");
});
}
};
39 changes: 39 additions & 0 deletions lambda/http_request.js
@@ -0,0 +1,39 @@
'use strict';

require("env2")(".env"); // ensure JWT_SECRET environment variable is defined.
const http = require('https'); // ALWAYS use TLS over the internets!
const jwt = require('jsonwebtoken');
/**
* simple_http_request is a bare-bones http request using node.js core http
* see: https://nodejs.org/api/http.html#http_http_request_options_callback
* @param {Object} json - the JSON data we want to send to the Phoenix App.
* @param {Function} callback - a standard callback with error & response args
* response is a JSON Object unless there is an error. No error handling yet ...
*/

module.exports = function simple_http_request (json, callback) {
const options = { // the json data is included in the token! 😮
headers: {
'Authorization': jwt.sign(json, process.env.JWT_SECRET),
'Accept': 'application/json'
},
hostname: process.env.EMAIL_APP_URL, // e.g: phemail.herokuapp.com
method: 'POST', // HTTP post sans body: stackoverflow.com/questions/4191593
port: '443',
path: '/api/sns' // the API endpoint that processes and stores SNS data
}

http.request(options, function (res) {
let resStr = '';
res.setEncoding('utf8');
res.on('data', function (chunk) {
resStr += chunk;
}).on('end', function () {
return callback(res.statusCode, JSON.parse(resStr));
});
})
// .on('error', (e) => {
// console.error(`problem with request: ${e.message}`);
// })
.end();
};
52 changes: 52 additions & 0 deletions lambda/s3.js
@@ -0,0 +1,52 @@
'use strict';
require('env2')('.env');
const AWS = require('aws-sdk');
// Region used by the AWS SDK for every call made from this module.
AWS.config.region = 'eu-west-1';
// Client with the bucket (from AWS_S3_BUCKET) pre-bound as a default param,
// so `save` and `get` only need to supply the object Key.
const s3 = new AWS.S3({
  params: { Bucket: process.env.AWS_S3_BUCKET }
});

/**
 * `save` saves a JSON object to S3 as `<key>.json`.
 * if you need to specify the file name, use `json.key`;
 * when `json.key` is absent the file is named `event.json`.
 * @param {Object} json - the object we want to store on S3
 * @param {Function} [callback] - optional; called as `callback(err, data)`
 * once the upload finishes, or as `callback('ERROR: please provide json data')`
 * when `json` is falsy. When it is not a function the result is discarded.
 */
module.exports.save = function save (json, callback) {
  // The callback is optional on every path, so a missing callback can never
  // raise a TypeError (previously only the upload path was guarded).
  const done = typeof callback === 'function' ? callback : function noop () {};

  if (!json) {
    return done('ERROR: please provide json data');
  }

  const filename = json.key || 'event';
  const params = {
    Key: filename + '.json',
    Body: JSON.stringify(json),
    ContentType: 'application/json',
    // NOTE(review): objects are uploaded with a public-read ACL - confirm
    // that stored events never contain sensitive data.
    ACL: 'public-read'
  };

  s3.upload(params, function (err, data) {
    return done(err, data);
  });
}

/**
 * `get` retrieves and parses a JSON file from S3
 * this function is only used to test the `save` method.
 * @param {String} key - the filename of the object to get from S3
 * @param {Function} callback - called once the object has been retrieved:
 * `callback(error)` when the S3 request fails or the stored body is not
 * valid JSON, otherwise `callback(error, json)` with the parsed object
 * (`error` being the falsy value reported by the SDK).
 */
module.exports.get = function get (key, callback) {
  s3.getObject({Key: key}, function (error, data) {
    if (error) {
      return callback(error);
    }

    let json;
    // Keep the parse outside the callback invocation: a malformed object
    // is reported to the caller instead of throwing inside the SDK callback.
    try {
      json = JSON.parse(data.Body.toString());
    } catch (parseError) {
      return callback(parseError);
    }
    return callback(error, json);
  });
};
17 changes: 17 additions & 0 deletions lib/next_page_beta.js
@@ -0,0 +1,17 @@
/**
* next_page checks for pagination on a "beta" page ref #131
* @param {Object} $ - cheerio object with DOM of page to be scraped
* @param {Object} data - the data we have scraped from the page so far
* @return {Object} the data object with a next_page key & value
*/
module.exports = function next_page_beta ($, data) {
const next = $('.TablePaginationSteps').find('[class^="Pagination__Page-"]').last().attr('href');
data.next_page = '';
/* istanbul ignore else */
if (next) {
const url = data.url.split('?')[0];
data.next_page = url + '?type=all&' + 'page=' + next.replace('#', '');
}

return data;
}
2 changes: 1 addition & 1 deletion lib/org.js
Expand Up @@ -38,7 +38,7 @@ function org($, url, callback) {
name: $(parent + ' a').first().text().trim(),
lang: $(parent + 'span[itemprop=programmingLanguage]').first().text().trim(),
url: $(parent + ' a').first().attr('href'),
description: $(parent + 'p.d-inline-block').first().text().trim(),
description: $(parent + 'p[itemprop=description]').first().text().trim(),
updated: $(parent + ' relative-time')[0].attribs.datetime
});
});
Expand Down
46 changes: 46 additions & 0 deletions lib/org_repos.js
@@ -0,0 +1,46 @@
/**
* `org_repos` parses a given GitHub organization repositories page.
* e.g: https://github.com/orgs/dwyl/repositories?type=all
* @param {object} $ - the cheerio DOM object.
* @param {string} url - the url of the page to be parsed.
* @param {function} callback - the callback we should call after scraping
* a callback passed into this method should accept two parameters:
* @param {objectj} error an error object (set to null if no error occurred)
* @param {object} data - the complete organsiation data
*/
function org_repos($, url, callback) {
var data = { url: url, type: 'org_repos' };
data.name = $('h1.lh-condensed').first().text().trim();
// data.description = $('h1.lh-condensed').parent().next().text().trim(); // yep ...¯\_(ツ)_/¯
data.description = $('.container-xl .color-fg-muted').first().text().trim()
// var people = $('.Counter').eq(1); // people is *second* in list of tabs!
// data.pcount = parseInt(people.first().text(), 10);
// data.pcount = isNaN(data.pcount) ? 0 : data.pcount
data.avatar = $('.avatar')[0].attribs.src;
var parts = data.avatar.split('/');
data.uid = parseInt(parts[parts.length-1].split('?')[0], 10);
// list of repos
var items = $('li.listviewitem');
// console.log('items.length', items.length);
data.entries = []; // avoid having circular reference objects! :-(
items.each( function (i) { // JS counters start at 0.
// console.log(i)
var parent = 'li:nth-child(' + (i+1) +') '; // CSS selectors start at 1.
console.log($(parent))
console.log($(parent + ' .markdown-title'))
data.entries.push({
// feel free to add more attributes to this! 🙏
name: $(parent + ' .markdown-title').text().trim(),
// lang: $(parent + ' .listview-item-main-content').find('[class^="Text-"]').text().trim(),
url: $(parent + ' a').first().attr('href'),
description: $(parent + ' .repos-list-description').first().text().trim(),
// updated: $(parent + ' relative-time')[0].attribs.datetime
});
});
// console.log(data)

data = require('./next_page_beta')($, data); // don't worry this gets cached ;-)
callback(null, data);
}

module.exports = org_repos
10 changes: 5 additions & 5 deletions lib/profile.js
@@ -1,5 +1,5 @@

const selectors=require('../config/repos')
const selectors = require('../config/repos')
/**
* profile method scrapes a given GitHub user profile
* @param {string} username - a valid GitHub username
Expand Down Expand Up @@ -46,11 +46,11 @@ module.exports = function profile ($, url, callback) {
data.website = $('[data-test-selector=profile-website-url] > a').attr("href")
// data.joined = $('.join-date').attr('datetime'); // Joined GitHub

// Contributions to Open Source in the past 12 months
data.contribs = parseInt($('.js-yearly-contributions').text().trim()
.split(' contributions')[0].replace(',', ''), 10);
// Contributions to Open Source in the past 12 months #132
// data.contribs = parseInt($('.js-yearly-contributions h2').text().trim()
// .split(' contributions')[0].replace(',', ''), 10);
// Contribution Matrix
data = require('./profile_contribs.js')($, data);
// data = require('./profile_contribs.js')($, data);

// List of (Public) organizations from profile
// data-hovercard-type="organization"
Expand Down
1 change: 1 addition & 0 deletions lib/scrapers.js
Expand Up @@ -7,6 +7,7 @@ module.exports = {
// labels : require('./labels'),
// milestones : require('./milestones'),
org: require('./org'),
org_repos: require('./org_repos'),
people: require('./people'),
profile: require('./profile'),
repo: require('./repo'),
Expand Down
4 changes: 4 additions & 0 deletions lib/switcher.js
Expand Up @@ -55,6 +55,10 @@ module.exports = function switcher (url, callback) {
console.log('repos_user - - - - - - - - -')
scraper = 'repos_user';
}
// e.g: https://github.com/orgs/dwyl/repositories?type=all
else if(url.match(/org/) && url.match(/repositories/)) {
scraper = 'org_repos';
}
else if(url.match(/followers|following/)) {
scraper = 'followers'; // html/DOM is identical for these 2 pages!
}
Expand Down
3 changes: 0 additions & 3 deletions lib/utils.js
Expand Up @@ -19,9 +19,6 @@ function parse_int (str) {
, 10)
}

/**
* A library of utility functions for parsing web data.
*/
module.exports = {
parse_int: parse_int
}

0 comments on commit bee7655

Please sign in to comment.