Skip to content
This repository has been archived by the owner on May 10, 2021. It is now read-only.

Commit

Permalink
Merge pull request #146 from nlnwa/recorderproxy
Browse files: browse the repository at this point in the history
Handle NPE in crawlLog Request matching
  • Loading branch information
johnerikhalse committed Jul 15, 2019
2 parents 5931123 + 18289cf commit 08d79b3
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 27 deletions.
4 changes: 2 additions & 2 deletions pom.xml
Original file line number | Diff line number | Diff line change
Expand Up @@ -36,7 +36,7 @@
<chrome.version>75.0.3738.0</chrome.version>

<veidemann.commons.version>0.3.11</veidemann.commons.version>
<veidemann.rethinkdbadapter.version>0.3.17</veidemann.rethinkdbadapter.version>
<veidemann.rethinkdbadapter.version>0.3.18</veidemann.rethinkdbadapter.version>

<log4j.version>2.7</log4j.version>
<org.jwat.version>1.1.1</org.jwat.version>
Expand All @@ -54,7 +54,7 @@
<veidemann.dns.resolver.version>0.2.5</veidemann.dns.resolver.version>
<veidemann.contentwriter.version>0.2.5</veidemann.contentwriter.version>
<veidemann.robotsevaluator.version>0.3.8</veidemann.robotsevaluator.version>
<veidemann.frontier.version>0.3.19</veidemann.frontier.version>
<veidemann.frontier.version>0.3.20</veidemann.frontier.version>
<veidemann.controller.version>0.3.17</veidemann.controller.version>
<veidemann.recorderproxy.version>0.1.2</veidemann.recorderproxy.version>
</properties>
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -103,7 +103,7 @@ public FrontierClient(BrowserController controller, ManagedChannelBuilder<?> cha
asyncStub = FrontierGrpc.newStub(channel).withWaitForReady();
pool = new Pool<>(maxOpenSessions - 1, () -> new ProxySession(idx.getAndIncrement(),
browserWsEndpoint, proxyHost, firstProxyPort), null, p -> p.reset());
LOG.error("Frontier client pointing to " + channel.authority());
LOG.info("Frontier client pointing to " + channel.authority());
}

public void requestNextPage() throws InterruptedException {
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -399,35 +399,38 @@ private boolean innerFindRequestForCrawlLog(Entry crawlLogEntry, UriRequest r) {
&& !r.isFromCache()
&& uriEquals(r.getUrl(), crawlLogEntry.uri)) {

if (crawlLogEntry.isResponseReceived() && crawlLogEntry.getCrawlLog().getStatusCode() == r.getStatusCode()) {
requestFound = true;
} else if (r.getStatusCode() == ExtraStatusCodes.CANCELED_BY_BROWSER.getCode()) {
if (crawlLogEntry.isResponseReceived()) {
if (crawlLogEntry.isResponseReceived()) {
if (crawlLogEntry.getCrawlLog().getStatusCode() == r.getStatusCode()) {
requestFound = true;
} else if (r.getStatusCode() == ExtraStatusCodes.CANCELED_BY_BROWSER.getCode()) {
r.setStatusCode(crawlLogEntry.getCrawlLog().getStatusCode());
requestFound = true;
} else {
} else if (crawlLogEntry.getCrawlLog().getStatusCode() == ExtraStatusCodes.CANCELED_BY_BROWSER.getCode()) {
r.setStatusCode(crawlLogEntry.getCrawlLog().getStatusCode());
requestFound = true;
} else if (r.getStatusCode() == 504 && crawlLogEntry.getCrawlLog().getStatusCode() == ExtraStatusCodes.HTTP_TIMEOUT.getCode()) {
// If http times out, the proxy will return 504, but proxy sets crawllogstatus to -4 which is the underlying status.
// Update request to match crawllog
r.setStatusCode(crawlLogEntry.getCrawlLog().getStatusCode());
requestFound = true;
} else if (r.getStatusCode() == 0 && crawlLogEntry.getCrawlLog().getStatusCode() == ExtraStatusCodes.CONNECT_FAILED.getCode()) {
// If https connect fails, the proxy will return 0, but proxy sets crawllogstatus to -2 which is the underlying status.
// Update request to match crawllog
r.setStatusCode(crawlLogEntry.getCrawlLog().getStatusCode());
requestFound = true;
} else if (r.getStatusCode() == 403 && crawlLogEntry.getCrawlLog().getStatusCode() == ExtraStatusCodes.PRECLUDED_BY_ROBOTS.getCode()) {
// If request is precluded by robots.txt, the proxy will return 403, but proxy sets crawllogstatus to -9998 which is the underlying status.
// Update request to match crawllog
r.setStatusCode(crawlLogEntry.getCrawlLog().getStatusCode());
requestFound = true;
} else {
LOG.warn("Unhandled response: Request status: {}, CrawlLog status: {}, URL:{}", r.getStatusCode(), crawlLogEntry.getCrawlLog().getStatusCode(), r.getUrl());
}
} else if (r.getStatusCode() == 504 && crawlLogEntry.getCrawlLog().getStatusCode() == ExtraStatusCodes.HTTP_TIMEOUT.getCode()) {
// If http times out, the proxy will return 504, but proxy sets crawllogstatus to -4 which is the underlying status.
// Update request to match crawllog
r.setStatusCode(crawlLogEntry.getCrawlLog().getStatusCode());
requestFound = true;
} else if (r.getStatusCode() == 0 && crawlLogEntry.getCrawlLog().getStatusCode() == ExtraStatusCodes.CONNECT_FAILED.getCode()) {
// If https connect fails, the proxy will return 0, but proxy sets crawllogstatus to -2 which is the underlying status.
// Update request to match crawllog
r.setStatusCode(crawlLogEntry.getCrawlLog().getStatusCode());
requestFound = true;
} else if (r.getStatusCode() == 403 && crawlLogEntry.getCrawlLog().getStatusCode() == ExtraStatusCodes.PRECLUDED_BY_ROBOTS.getCode()) {
// If request is precluded by robots.txt, the proxy will return 403, but proxy sets crawllogstatus to -9998 which is the underlying status.
// Update request to match crawllog
r.setStatusCode(crawlLogEntry.getCrawlLog().getStatusCode());
requestFound = true;
} else {
try {
System.out.println("REQUEST STATUS: " + r.getStatusCode() + ", CL STATUS: " + crawlLogEntry.getCrawlLog().getStatusCode() + " :: " + r.getUrl());
} catch (NullPointerException e) {
System.out.println("NPE REQUEST STATUS: " + r.getStatusCode() + ", CL STATUS: " + crawlLogEntry.isResponseReceived() + " :: " + r.getUrl());
if (r.getStatusCode() == ExtraStatusCodes.CANCELED_BY_BROWSER.getCode()) {
requestFound = true;
} else {
LOG.debug("Response not received (yet?): Request status: {}, URL:{}", r.getStatusCode(), r.getUrl());
}
}
}
Expand Down

0 comments on commit 08d79b3

Please sign in to comment.