From 0ff197501b747a3cee61264b4a089c04a25f83cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Wollse=CC=81n?= Date: Wed, 14 Aug 2019 12:48:12 +0300 Subject: [PATCH] Allow content saving to be configured by any resource type --- .../Extension/firefox/feature.js/index.js | 6 +- .../src/background/http-instrument.ts | 83 ++++++++++++------- crawler.py | 6 +- 3 files changed, 60 insertions(+), 35 deletions(-) diff --git a/automation/Extension/firefox/feature.js/index.js b/automation/Extension/firefox/feature.js/index.js index dad5f44bb..a03e9d463 100644 --- a/automation/Extension/firefox/feature.js/index.js +++ b/automation/Extension/firefox/feature.js/index.js @@ -22,8 +22,7 @@ async function main() { cookie_instrument:true, js_instrument:true, http_instrument:true, - save_javascript:false, - save_all_content:false, + save_content:false, testing:true, crawl_id:0 }; @@ -56,8 +55,7 @@ async function main() { loggingDB.logDebug("HTTP Instrumentation enabled"); let httpInstrument = new HttpInstrument(loggingDB); httpInstrument.run(config['crawl_id'], - config['save_javascript'], - config['save_all_content']); + config['save_content']); } } diff --git a/automation/Extension/webext-instrumentation/src/background/http-instrument.ts b/automation/Extension/webext-instrumentation/src/background/http-instrument.ts index 39690d07c..03375ab80 100644 --- a/automation/Extension/webext-instrumentation/src/background/http-instrument.ts +++ b/automation/Extension/webext-instrumentation/src/background/http-instrument.ts @@ -15,6 +15,8 @@ import { WebRequestOnCompletedEventDetails, } from "../types/browser-web-request-event-details"; +type SaveContentOption = boolean | string | ResourceType[]; + /** * Note: Different parts of the desired information arrives in different events as per below: * request = headers in onBeforeSendHeaders + body in onBeforeRequest @@ -40,7 +42,7 @@ export class HttpInstrument { this.dataReceiver = dataReceiver; } - public run(crawlID, saveJavascript, saveAllContent) { + public run(crawlID, saveContentOption: SaveContentOption) { const allTypes: ResourceType[] = [ "beacon", "csp_report", @@ -77,7 +79,9 @@ export class HttpInstrument { * Attach handlers to event listeners */ - this.onBeforeRequestListener = details => { + this.onBeforeRequestListener = ( + details: WebRequestOnBeforeRequestEventDetails, + ) => { const blockingResponseThatDoesNothing: BlockingResponse = {}; // Ignore requests made by extensions if (requestStemsFromExtension(details)) { @@ -87,9 +91,7 @@ export class HttpInstrument { pendingRequest.resolveOnBeforeRequestEventDetails(details); const pendingResponse = this.getPendingResponse(details.requestId); pendingResponse.resolveOnBeforeRequestEventDetails(details); - if (saveAllContent) { - pendingResponse.addResponseResponseBodyListener(details); - } else if (saveJavascript && this.isJS(details.type)) { + if (this.shouldSaveContent(saveContentOption, details.type)) { pendingResponse.addResponseResponseBodyListener(details); } return blockingResponseThatDoesNothing; @@ -97,7 +99,7 @@ export class HttpInstrument { browser.webRequest.onBeforeRequest.addListener( this.onBeforeRequestListener, filter, - saveJavascript || saveAllContent + this.shouldPossiblýSaveContent(saveContentOption) ? ["requestBody", "blocking"] : ["requestBody"], ); @@ -145,8 +147,7 @@ export class HttpInstrument { details, crawlID, incrementedEventOrdinal(), - saveJavascript, - saveAllContent, + saveContentOption, ); }; browser.webRequest.onCompleted.addListener( @@ -177,6 +178,49 @@ export class HttpInstrument { } } + private shouldPossiblýSaveContent(saveContentOption: SaveContentOption) { + if (saveContentOption === true) { + return true; + } + if (saveContentOption === false) { + return false; + } + if (this.saveContentResourceTypes(saveContentOption).length > 0) { + return true; + } + return false; + } + + private saveContentResourceTypes( + saveContentOption: string | ResourceType[], + ): ResourceType[] { + return typeof saveContentOption === "string" + ? (saveContentOption.split(",") as ResourceType[]) + : saveContentOption; + } + + /** + * We rely on the resource type to filter responses + * See: https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/API/webRequest/ResourceType + * + * @param saveContentOption + * @param resourceType + */ + private shouldSaveContent( + saveContentOption: SaveContentOption, + resourceType: ResourceType, + ) { + if (saveContentOption === true) { + return true; + } + if (saveContentOption === false) { + return false; + } + return this.saveContentResourceTypes(saveContentOption).includes( + resourceType, + ); + } + private getPendingRequest(requestId): PendingRequest { if (!this.pendingRequests[requestId]) { this.pendingRequests[requestId] = new PendingRequest(); @@ -582,34 +626,19 @@ export class HttpInstrument { } } - /** - * Return true if this request is loading javascript - * We rely mostly on the content policy type to filter responses - * and fall back to the URI and content type string for types that can - * load various resource types. - * See: https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/API/webRequest/ResourceType - * - * @param resourceType - */ - private isJS(resourceType: ResourceType): boolean { - return resourceType === "script"; - } - // Instrument HTTP responses private async onCompletedHandler( details: WebRequestOnCompletedEventDetails, crawlID, eventOrdinal, - saveJavascript, - saveAllContent, + saveContent, ) { /* console.log( "onCompletedHandler (previously httpRequestHandler)", details, crawlID, - saveJavascript, - saveAllContent, + saveContent, ); */ @@ -676,9 +705,7 @@ export class HttpInstrument { update.headers = JSON.stringify(headers); update.location = escapeString(location); - if (saveAllContent) { - this.logWithResponseBody(details, update); - } else if (saveJavascript && this.isJS(details.type)) { + if (this.shouldSaveContent(saveContent, details.type)) { this.logWithResponseBody(details, update); } else { this.dataReceiver.saveRecord("http_responses", update); diff --git a/crawler.py b/crawler.py index c1c0eb007..25cef8230 100644 --- a/crawler.py +++ b/crawler.py @@ -21,7 +21,7 @@ COOKIE_INSTRUMENT = os.getenv('COOKIE_INSTRUMENT', '1') == '1' NAVIGATION_INSTRUMENT = os.getenv('NAVIGATION_INSTRUMENT', '1') == '1' JS_INSTRUMENT = os.getenv('JS_INSTRUMENT', '1') == '1' -SAVE_JAVASCRIPT = os.getenv('SAVE_JAVASCRIPT', '0') == '1' +SAVE_CONTENT = os.getenv('SAVE_CONTENT', '') DWELL_TIME = int(os.getenv('DWELL_TIME', '10')) TIMEOUT = int(os.getenv('TIMEOUT', '60')) SENTRY_DSN = os.getenv('SENTRY_DSN', None) @@ -36,7 +36,7 @@ browser_params[i]['cookie_instrument'] = COOKIE_INSTRUMENT browser_params[i]['navigation_instrument'] = NAVIGATION_INSTRUMENT browser_params[i]['js_instrument'] = JS_INSTRUMENT - browser_params[i]['save_javascript'] = SAVE_JAVASCRIPT + browser_params[i]['save_content'] = SAVE_CONTENT browser_params[i]['headless'] = True # Manager configuration @@ -69,7 +69,7 @@ scope.set_tag('COOKIE_INSTRUMENT', COOKIE_INSTRUMENT) scope.set_tag('NAVIGATION_INSTRUMENT', NAVIGATION_INSTRUMENT) scope.set_tag('JS_INSTRUMENT', JS_INSTRUMENT) - scope.set_tag('SAVE_JAVASCRIPT', SAVE_JAVASCRIPT) + scope.set_tag('SAVE_CONTENT', SAVE_CONTENT) scope.set_tag('DWELL_TIME', DWELL_TIME) scope.set_tag('TIMEOUT', TIMEOUT) scope.set_tag('CRAWL_REFERENCE', '%s/%s' %