Skip to content

Commit

Permalink
Allow content saving to be configured by any resource type
Browse files Browse the repository at this point in the history
  • Loading branch information
motin committed Aug 14, 2019
1 parent 2c2c695 commit 0ff1975
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 35 deletions.
6 changes: 2 additions & 4 deletions automation/Extension/firefox/feature.js/index.js
Expand Up @@ -22,8 +22,7 @@ async function main() {
cookie_instrument:true,
js_instrument:true,
http_instrument:true,
save_javascript:false,
save_all_content:false,
save_content:false,
testing:true,
crawl_id:0
};
Expand Down Expand Up @@ -56,8 +55,7 @@ async function main() {
loggingDB.logDebug("HTTP Instrumentation enabled");
let httpInstrument = new HttpInstrument(loggingDB);
httpInstrument.run(config['crawl_id'],
config['save_javascript'],
config['save_all_content']);
config['save_content']);
}
}

Expand Down
Expand Up @@ -15,6 +15,8 @@ import {
WebRequestOnCompletedEventDetails,
} from "../types/browser-web-request-event-details";

type SaveContentOption = boolean | string | ResourceType[];

/**
* Note: Different parts of the desired information arrive in different events as per below:
* request = headers in onBeforeSendHeaders + body in onBeforeRequest
Expand All @@ -40,7 +42,7 @@ export class HttpInstrument {
this.dataReceiver = dataReceiver;
}

public run(crawlID, saveJavascript, saveAllContent) {
public run(crawlID, saveContentOption: SaveContentOption) {
const allTypes: ResourceType[] = [
"beacon",
"csp_report",
Expand Down Expand Up @@ -77,7 +79,9 @@ export class HttpInstrument {
* Attach handlers to event listeners
*/

this.onBeforeRequestListener = details => {
this.onBeforeRequestListener = (
details: WebRequestOnBeforeRequestEventDetails,
) => {
const blockingResponseThatDoesNothing: BlockingResponse = {};
// Ignore requests made by extensions
if (requestStemsFromExtension(details)) {
Expand All @@ -87,17 +91,15 @@ export class HttpInstrument {
pendingRequest.resolveOnBeforeRequestEventDetails(details);
const pendingResponse = this.getPendingResponse(details.requestId);
pendingResponse.resolveOnBeforeRequestEventDetails(details);
if (saveAllContent) {
pendingResponse.addResponseResponseBodyListener(details);
} else if (saveJavascript && this.isJS(details.type)) {
if (this.shouldSaveContent(saveContentOption, details.type)) {
pendingResponse.addResponseResponseBodyListener(details);
}
return blockingResponseThatDoesNothing;
};
browser.webRequest.onBeforeRequest.addListener(
this.onBeforeRequestListener,
filter,
saveJavascript || saveAllContent
this.shouldPossiblýSaveContent(saveContentOption)
? ["requestBody", "blocking"]
: ["requestBody"],
);
Expand Down Expand Up @@ -145,8 +147,7 @@ export class HttpInstrument {
details,
crawlID,
incrementedEventOrdinal(),
saveJavascript,
saveAllContent,
saveContentOption,
);
};
browser.webRequest.onCompleted.addListener(
Expand Down Expand Up @@ -177,6 +178,49 @@ export class HttpInstrument {
}
}

/**
 * Tells whether content saving could apply to at least one request.
 *
 * A boolean option is returned unchanged. Any other option is converted to a
 * list of resource types, and the answer is whether that list has entries.
 *
 * NOTE(review): the accented "ý" in the method name is kept as-is because the
 * visible call site uses the same spelling.
 *
 * @param saveContentOption boolean switch, comma-separated string of resource
 *   types, or an array of resource types
 * @returns true when content of some resource type may have to be saved
 */
private shouldPossiblýSaveContent(saveContentOption: SaveContentOption) {
  if (typeof saveContentOption === "boolean") {
    return saveContentOption;
  }
  const resourceTypes = this.saveContentResourceTypes(saveContentOption);
  return resourceTypes.length > 0;
}

/**
 * Normalizes a non-boolean save-content option into a list of resource types.
 *
 * An array is returned unchanged. A string is treated as a comma-separated
 * list: each entry is trimmed and blank entries are discarded, so that an
 * empty string (or stray commas/whitespace) results in an empty list rather
 * than a list containing "".
 *
 * @param saveContentOption comma-separated string of resource types, or an
 *   array of resource types
 * @returns the resource types named by the option
 */
private saveContentResourceTypes(
  saveContentOption: string | ResourceType[],
): ResourceType[] {
  if (typeof saveContentOption !== "string") {
    return saveContentOption;
  }
  // "".split(",") evaluates to [""], which would make an empty option look
  // like a one-element list; trimming and filtering avoids that and also
  // tolerates input such as "script, image".
  return saveContentOption
    .split(",")
    .map(entry => entry.trim())
    .filter(entry => entry.length > 0) as ResourceType[];
}

/**
 * Decides whether content should be saved for a given resource type.
 *
 * A boolean option applies to every resource type. Otherwise the option is
 * converted to a list of resource types and the given type must be in it.
 * Filtering relies on the webRequest resource type, see:
 * https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/API/webRequest/ResourceType
 *
 * @param saveContentOption boolean switch, comma-separated string of resource
 *   types, or an array of resource types
 * @param resourceType the resource type to check against the option
 * @returns true when content of this resource type should be saved
 */
private shouldSaveContent(
  saveContentOption: SaveContentOption,
  resourceType: ResourceType,
) {
  if (typeof saveContentOption === "boolean") {
    return saveContentOption;
  }
  const wantedTypes = this.saveContentResourceTypes(saveContentOption);
  return wantedTypes.includes(resourceType);
}

private getPendingRequest(requestId): PendingRequest {
if (!this.pendingRequests[requestId]) {
this.pendingRequests[requestId] = new PendingRequest();
Expand Down Expand Up @@ -582,34 +626,19 @@ export class HttpInstrument {
}
}

/**
* Return true if this request is loading JavaScript.
* The decision is based solely on the resource type: only the value
* "script" counts. No fallback to the URI or a content type string is
* performed here.
* See: https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/API/webRequest/ResourceType
*
* @param resourceType the resource type to test
* @returns true when resourceType is "script", false otherwise
*/
private isJS(resourceType: ResourceType): boolean {
return resourceType === "script";
}

// Instrument HTTP responses
private async onCompletedHandler(
details: WebRequestOnCompletedEventDetails,
crawlID,
eventOrdinal,
saveJavascript,
saveAllContent,
saveContent,
) {
/*
console.log(
"onCompletedHandler (previously httpRequestHandler)",
details,
crawlID,
saveJavascript,
saveAllContent,
saveContent,
);
*/

Expand Down Expand Up @@ -676,9 +705,7 @@ export class HttpInstrument {
update.headers = JSON.stringify(headers);
update.location = escapeString(location);

if (saveAllContent) {
this.logWithResponseBody(details, update);
} else if (saveJavascript && this.isJS(details.type)) {
if (this.shouldSaveContent(saveContent, details.type)) {
this.logWithResponseBody(details, update);
} else {
this.dataReceiver.saveRecord("http_responses", update);
Expand Down
6 changes: 3 additions & 3 deletions crawler.py
Expand Up @@ -21,7 +21,7 @@
COOKIE_INSTRUMENT = os.getenv('COOKIE_INSTRUMENT', '1') == '1'
NAVIGATION_INSTRUMENT = os.getenv('NAVIGATION_INSTRUMENT', '1') == '1'
JS_INSTRUMENT = os.getenv('JS_INSTRUMENT', '1') == '1'
SAVE_JAVASCRIPT = os.getenv('SAVE_JAVASCRIPT', '0') == '1'
SAVE_CONTENT = os.getenv('SAVE_CONTENT', '')
DWELL_TIME = int(os.getenv('DWELL_TIME', '10'))
TIMEOUT = int(os.getenv('TIMEOUT', '60'))
SENTRY_DSN = os.getenv('SENTRY_DSN', None)
Expand All @@ -36,7 +36,7 @@
browser_params[i]['cookie_instrument'] = COOKIE_INSTRUMENT
browser_params[i]['navigation_instrument'] = NAVIGATION_INSTRUMENT
browser_params[i]['js_instrument'] = JS_INSTRUMENT
browser_params[i]['save_javascript'] = SAVE_JAVASCRIPT
browser_params[i]['save_content'] = SAVE_CONTENT
browser_params[i]['headless'] = True

# Manager configuration
Expand Down Expand Up @@ -69,7 +69,7 @@
scope.set_tag('COOKIE_INSTRUMENT', COOKIE_INSTRUMENT)
scope.set_tag('NAVIGATION_INSTRUMENT', NAVIGATION_INSTRUMENT)
scope.set_tag('JS_INSTRUMENT', JS_INSTRUMENT)
scope.set_tag('SAVE_JAVASCRIPT', SAVE_JAVASCRIPT)
scope.set_tag('SAVE_CONTENT', SAVE_CONTENT)
scope.set_tag('DWELL_TIME', DWELL_TIME)
scope.set_tag('TIMEOUT', TIMEOUT)
scope.set_tag('CRAWL_REFERENCE', '%s/%s' %
Expand Down

0 comments on commit 0ff1975

Please sign in to comment.