This repository has been archived by the owner on May 10, 2021. It is now read-only.

Merge pull request #150 from nlnwa/handle_ERR_FAILED
Handle net.ERR_FAILED from browser
maeb committed Oct 25, 2019
2 parents 4382d5b + 9c5bc76 commit 23d2a45
Showing 13 changed files with 101 additions and 32 deletions.
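In outline: when the browser reports net::ERR_FAILED, the failed request can be left without a usable status code (status 0), and the matching code in CrawlLogRegistry now keeps the status recorded by the proxy's crawl log for such requests instead of leaving them unresolved (see the innerFindRequestForCrawlLog hunk below). A minimal, self-contained Java sketch of that fallback, using placeholder names rather than the project's actual classes:

// Hypothetical sketch only; ErrFailedFallback, NO_STATUS and resolveStatus are
// illustrative names, not part of the codebase.
public class ErrFailedFallback {

    // A browser-side net::ERR_FAILED leaves the request with no usable status.
    private static final int NO_STATUS = 0;

    // Prefer the status the recording proxy wrote to the crawl log when the
    // browser-side status is missing; otherwise keep the browser's status.
    static int resolveStatus(int browserStatus, int crawlLogStatus) {
        return browserStatus == NO_STATUS ? crawlLogStatus : browserStatus;
    }

    public static void main(String[] args) {
        System.out.println(resolveStatus(0, 200));   // browser failed, keep the crawl log's 200
        System.out.println(resolveStatus(404, 404)); // browser saw a real status, keep it
    }
}

The commit also bumps Chrome to 78.0.3882.0, adds an explicit headless flag to the browser WebSocket endpoint, and records the HTTP method on each UriRequest.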
pom.xml (4 changes: 2 additions & 2 deletions)
@@ -33,7 +33,7 @@
<maven.compiler.target>11</maven.compiler.target>
<docker.tag>latest</docker.tag>

<chrome.version>75.0.3738.0</chrome.version>
<chrome.version>78.0.3882.0</chrome.version>

<veidemann.commons.version>0.3.11</veidemann.commons.version>
<veidemann.rethinkdbadapter.version>0.3.18</veidemann.rethinkdbadapter.version>
@@ -48,7 +48,7 @@
<netty.version>4.1.42.Final</netty.version>

<!-- Versions of external docker containers used in integration tests -->
<browserless.chrome.version>1.5.0-puppeteer-1.14.0</browserless.chrome.version>
<browserless.chrome.version>1.18.1-puppeteer-1.20.0</browserless.chrome.version>
<rethinkdb.version>2.3.6</rethinkdb.version>
<veidemann.cache.version>0.2.2</veidemann.cache.version>
<veidemann.dns.resolver.version>0.2.5</veidemann.dns.resolver.version>
@@ -72,7 +72,7 @@ protected void checkVersion() {
String browserVersion = getVersionNumber(protocolClient.getRemoteVersion());
if (!clientVersion.equals(browserVersion)) {
throw new RuntimeException("Chrome client version and browser version does not match. Client: "
+ clientVersion + ", BrowserClient: " + browserVersion);
+ clientVersion + ", Browser: " + browserVersion);
}
}

@@ -54,7 +54,7 @@ public static void main(String args[]) throws IOException {
+ CHROME_VERSION + "/third_party/blink/renderer/core/inspector/browser_protocol.pdl?format=text";

String jsProtocol = "https://chromium.googlesource.com/v8/v8/+/chromium/"
+ CHROME_VERSION.split("\\.")[2] + "/src/inspector/js_protocol.pdl?format=text";
+ CHROME_VERSION.split("\\.")[2] + "/include/js_protocol.pdl?format=text";

System.out.println("Using protocol definitions from:");
System.out.println(" " + browserProtocol);
@@ -38,6 +38,8 @@
import no.nb.nna.veidemann.commons.util.ApiTools;
import no.nb.nna.veidemann.commons.util.CollectionNameGenerator;
import no.nb.nna.veidemann.harvester.browsercontroller.BrowserSession;
import org.netpreserve.commons.uri.Uri;
import org.netpreserve.commons.uri.UriConfigs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -132,11 +134,13 @@ public void onCompleted() {
}

private BrowserScript getReplacementScript(BrowserSession session, String uri) {
String normalizedUri = UriConfigs.WHATWG.buildUri(uri).toString();
Label replacementLabel = ApiTools.buildLabel("type", "replacement");
for (ConfigObject script : session.getScripts()) {
if (ApiTools.hasLabel(script.getMeta(), replacementLabel)) {
for (String urlRegexp : script.getBrowserScript().getUrlRegexpList()) {
if (uri.matches(urlRegexp)) {
if (normalizedUri.matches(urlRegexp)) {
LOG.warn("Check script {} {} {} {}", script.getMeta().getName(), normalizedUri, urlRegexp, normalizedUri.matches(urlRegexp));
return script.getBrowserScript();
}
}
@@ -45,6 +45,8 @@ public class FrontierClient implements AutoCloseable {

private final Pool<ProxySession> pool;

private final boolean headlessBrowser;

private static final String METRICS_NS = "veidemann";
private static final String METRICS_SUBSYSTEM = "harvester";

@@ -86,18 +88,19 @@ public class FrontierClient implements AutoCloseable {
.register();

public FrontierClient(BrowserController controller, String host, int port, int maxOpenSessions,
String browserWsEndpoint, String proxyHost, int firstProxyPort) {
String browserWsEndpoint, String proxyHost, int firstProxyPort, boolean headlessBrowser) {
this(controller, ManagedChannelBuilder.forAddress(host, port).usePlaintext(), maxOpenSessions,
browserWsEndpoint, proxyHost, firstProxyPort);
browserWsEndpoint, proxyHost, firstProxyPort, headlessBrowser);
}

/**
* Construct client for accessing RouteGuide server using the existing channel.
*/
public FrontierClient(BrowserController controller, ManagedChannelBuilder<?> channelBuilder, int maxOpenSessions,
String browserWsEndpoint, String proxyHost, int firstProxyPort) {
String browserWsEndpoint, String proxyHost, int firstProxyPort, boolean headlessBrowser) {
LOG.info("Setting up Frontier client");
this.controller = controller;
this.headlessBrowser = headlessBrowser;
ClientTracingInterceptor tracingInterceptor = new ClientTracingInterceptor.Builder(GlobalTracer.get()).build();
channel = channelBuilder.intercept(tracingInterceptor).build();
asyncStub = FrontierGrpc.newStub(channel).withWaitForReady();
@@ -240,6 +243,7 @@ public ProxySession(int proxyId, String browserWSEndpoint, String proxyHost, int
query = query.add(proxyEntry);
}
query = query.add(new Entry("--ignore-certificate-errors", (String) null));
query = query.add(new Entry("headless", Boolean.toString(headlessBrowser)));
browserWsEndpoint = UriConfigs.WHATWG.builder(ws).parsedQuery(query).build().toString();
browserSessions.inc();
LOG.info("Created session: " + this);
@@ -84,7 +84,7 @@ public Harvester start() {

FrontierClient frontierClient = new FrontierClient(controller, SETTINGS.getFrontierHost(),
SETTINGS.getFrontierPort(), SETTINGS.getMaxOpenSessions(), SETTINGS.getBrowserWSEndpoint(),
SETTINGS.getProxyHost(), SETTINGS.getProxyPort());
SETTINGS.getProxyHost(), SETTINGS.getProxyPort(), SETTINGS.isHeadlessBrowser());

RobotsServiceClient robotsServiceClient = new RobotsServiceClient(SETTINGS.getRobotsTxtEvaluatorHost(), SETTINGS.getRobotsTxtEvaluatorPort());

@@ -153,6 +153,7 @@ public RenderResult render(int proxyId, ChromeDebugProtocolConfig protocolConfig

PageLog.Builder pageLog = PageLog.newBuilder()
.setUri(queuedUri.getUri())
.setJobExecutionId(queuedUri.getJobExecutionId())
.setExecutionId(queuedUri.getExecutionId());
if (session.getUriRequests().getInitialRequest() == null) {
LOG.error("Missing initial request");
@@ -247,7 +247,6 @@ public void loadPage() throws ClientClosedException, SessionClosedException {
}
});

session.network().setExtraHTTPHeaders(ImmutableMap.of(EXECUTION_ID, queuedUri.getExecutionId(), JOB_EXECUTION_ID, queuedUri.getJobExecutionId())).run();
session.page().navigate(queuedUri.getUri()).withReferrer(queuedUri.getReferrer()).withTransitionType("link").run();
} catch (ExecutionException | TimeoutException ex) {
throw new RuntimeException(ex);
@@ -37,6 +37,7 @@

import java.util.ArrayList;
import java.util.List;
import java.util.StringJoiner;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
@@ -168,6 +169,21 @@ public void cancelRequest(String reason) {
LOG.debug("Canceling closed call");
}
}

@Override
public String toString() {
StringJoiner sj = new StringJoiner(", ", Entry.class.getSimpleName() + "[", "]")
.add("uri='" + uri + "'")
.add("resolved=" + resolved)
.add("fromCache=" + fromCache);
if (crawlLog != null) {
sj.add("status=" + crawlLog.getStatusCode())
.add("referrer=" + crawlLog.getReferrer())
.add("size=" + crawlLog.getSize())
.add("contentType=" + crawlLog.getContentType());
}
return sj.toString();
}
}

public CrawlLogRegistry(final BrowserSession session, final long pageLoadTimeout, final long maxIdleTime) {
@@ -235,7 +251,7 @@ public void run() {
waitForIdle();
crawlLogsLock.lock();
try {
innerMatchCrawlLogAndRequest(status);
innerMatchCrawlLogAndRequest(status, false);

if (!status.allHandled()) {
checkForFileDownload();
@@ -313,7 +329,7 @@ private void checkForCachedRequests() {
}
}

innerMatchCrawlLogAndRequest(status);
innerMatchCrawlLogAndRequest(status, false);
}

/**
@@ -327,15 +343,15 @@ private void checkForFileDownload() {
LOG.debug("Guessing that we are downloading a file. Status: {}", status);
crawlLogs.forEach(c -> {
browserSession.getUriRequests().resolveCurrentUriRequest("1").ifPresent(parent -> {
UriRequest r = UriRequest.create("1",
UriRequest r = UriRequest.create("1", "GET",
c.getCrawlLog().getRequestedUri(), browserSession.queuedUri.getReferrer(), ResourceType.Other,
'R', parent, browserSession.getUriRequests().getPageSpan());
r.setStatusCode(c.getCrawlLog().getStatusCode());
browserSession.getUriRequests().add(r);
}).otherwise(() -> {
// No parent, this is a root request;
if (c.isResponseReceived()) {
UriRequest r = UriRequest.createRoot("1",
UriRequest r = UriRequest.createRoot("1", "GET",
c.uri, browserSession.queuedUri.getReferrer(), ResourceType.Other,
browserSession.queuedUri.getDiscoveryPath(), browserSession.getUriRequests().getPageSpan());
r.setStatusCode(c.getCrawlLog().getStatusCode());
@@ -344,7 +360,7 @@ private void checkForFileDownload() {
});
});

innerMatchCrawlLogAndRequest(status);
innerMatchCrawlLogAndRequest(status, false);
}
}

@@ -358,14 +374,14 @@ public boolean waitForMatcherToFinish() {
// Send signal to stop waitForIdle loop
signalRequestsUpdated();
}
innerMatchCrawlLogAndRequest(status);
innerMatchCrawlLogAndRequest(status, true);
return success;
} catch (InterruptedException e) {
LOG.info("Pageload interrupted", e);
finishLatch.countDown();
// Send signal to stop waitForIdle loop
signalRequestsUpdated();
innerMatchCrawlLogAndRequest(status);
innerMatchCrawlLogAndRequest(status, true);
return false;
}
}
@@ -393,6 +409,10 @@ private boolean uriEquals(String u1, String u2) {
}

private boolean innerFindRequestForCrawlLog(Entry crawlLogEntry, UriRequest r) {
if (crawlLogEntry.isResolved()) {
return true;
}

boolean requestFound = false;
Timestamp now = ProtoUtils.getNowTs();
if (r.getCrawlLog() == null
@@ -438,6 +458,11 @@ && uriEquals(r.getUrl(), crawlLogEntry.uri)) {
// Update request to match crawllog
r.setStatusCode(crawlLogEntry.getCrawlLog().getStatusCode());
requestFound = true;
} else if (r.getStatusCode() == 0) {
// Browser failed reading response, but the crawl log returned a status which we will keep
LOG.warn("Don't understand a thing, but anyway {} -- {}", crawlLogEntry.uri, "r.getStatusCode() == 0");
r.setStatusCode(crawlLogEntry.getCrawlLog().getStatusCode());
requestFound = true;
} else {
LOG.warn("Unhandled response: Request status: {}, CrawlLog status: {}, URL:{}", r.getStatusCode(), crawlLogEntry.getCrawlLog().getStatusCode(), r.getUrl());
}
Expand Down Expand Up @@ -469,22 +494,31 @@ && uriEquals(r.getUrl(), crawlLogEntry.uri)) {
return requestFound;
}

private void innerMatchCrawlLogAndRequest(MatchStatus status) {
private void innerMatchCrawlLogAndRequest(MatchStatus status, boolean lastInvocation) {
status.reset();
if (!isCrawlLogsResolved()) {
crawlLogs.stream().filter(e -> (!e.isResolved()))
.forEach(e -> findRequestForCrawlLog(e, browserSession.getUriRequests().getInitialRequest()));
if (!isCrawlLogsResolved()) {
LOG.trace("There are still unhandled crawl logs");
if (lastInvocation) {
LOG.error("There are still unhandled crawl logs");
} else {
LOG.trace("There are still unhandled crawl logs");
}
}
}

browserSession.getUriRequests().getRequestStream().forEach(re -> {
if (re.getCrawlLog() == null) {
// Only requests that comes from the origin server should be added to the unhandled requests list
if (!re.isFromCache() && re.isFromProxy() && re.getStatusCode() >= 0) {
LOG.error("Missing CrawlLog for {} {} {} {}, fromCache: {}, fromProxy: {}", re.getRequestId(),
re.getStatusCode(), re.getUrl(), re.getDiscoveryPath(), re.isFromCache(), re.isFromProxy());
if (lastInvocation) {
LOG.error("Missing CrawlLog for {} {} {} {}, fromCache: {}, fromProxy: {}", re.getRequestId(),
re.getStatusCode(), re.getUrl(), re.getDiscoveryPath(), re.isFromCache(), re.isFromProxy());
} else {
LOG.trace("Missing CrawlLog for {} {} {} {}, fromCache: {}, fromProxy: {}", re.getRequestId(),
re.getStatusCode(), re.getUrl(), re.getDiscoveryPath(), re.isFromCache(), re.isFromProxy());
}

status.unhandledRequests.add(re);
}
@@ -530,4 +564,11 @@ public String toString() {
return sb.toString();
}
}

@Override
public String toString() {
StringJoiner sj = new StringJoiner("\n", CrawlLogRegistry.class.getSimpleName() + "[", "]");
crawlLogs.forEach(e -> sj.add(e.toString()));
return sj.toString();
}
}
@@ -40,6 +40,8 @@ public class UriRequest {

private static final Logger LOG = LoggerFactory.getLogger(UriRequest.class);

private String method;

private String url;

private String requestId;
@@ -93,8 +95,9 @@ public class UriRequest {
private final Condition noReferrer = referrerLock.newCondition();
private final Condition noDiscoveryPath = discoveryPathLock.newCondition();

private UriRequest(String requestId, String url, String referrer, ResourceType type, BaseSpan parentSpan) {
private UriRequest(String requestId, String method, String url, String referrer, ResourceType type, BaseSpan parentSpan) {
this.requestId = requestId;
this.method = method;
this.url = url;
this.referrer = referrer;
this.resourceType = type;
@@ -104,6 +107,7 @@ private UriRequest(String requestId, String url, String referrer, ResourceType t

private UriRequest(NetworkDomain.RequestWillBeSent request, BaseSpan parentSpan) {
this(request.requestId(),
request.request().method(),
request.request().url(),
(String) request.request().headers().getOrDefault("referer", ""),
ResourceType.forName(request.type()),
@@ -121,15 +125,15 @@ public static UriRequest createRoot(NetworkDomain.RequestWillBeSent request, Str
return result;
}

public static UriRequest createRoot(String requestId, String url, String referrer, ResourceType type, String initialDiscoveryPath, BaseSpan parentSpan) {
UriRequest result = new UriRequest(requestId, url, referrer, type, parentSpan);
public static UriRequest createRoot(String requestId, String method, String url, String referrer, ResourceType type, String initialDiscoveryPath, BaseSpan parentSpan) {
UriRequest result = new UriRequest(requestId, method, url, referrer, type, parentSpan);
result.discoveryPath = initialDiscoveryPath;
result.rootResource = true;
return result;
}

public static UriRequest create(String requestId, String url, String referrer, ResourceType type, char discoveryType, UriRequest parent, BaseSpan parentSpan) {
UriRequest result = new UriRequest(requestId, url, referrer, type, parentSpan);
public static UriRequest create(String requestId, String method, String url, String referrer, ResourceType type, char discoveryType, UriRequest parent, BaseSpan parentSpan) {
UriRequest result = new UriRequest(requestId, method, url, referrer, type, parentSpan);
result.setParent(parent);
result.discoveryPath = parent.discoveryPath + discoveryType;
return result;
@@ -153,7 +157,7 @@ public static UriRequest create(NetworkDomain.RequestWillBeSent request, UriRequ
}
}

result = create(request.requestId(), request.request().url(), parent.getUrl(),
result = create(request.requestId(), request.request().method(), request.request().url(), parent.getUrl(),
ResourceType.forName(request.type()), discoveryType, parent, parentSpan);

return result;
@@ -199,6 +203,10 @@ public void setMimeType(String mimeType) {
.isPresent();
}

public String getMethod() {
return method;
}

public String getUrl() {
return url;
}
@@ -413,6 +421,7 @@ public String toString(String indent) {
sb.append(", status=").append(statusCode);
sb.append(", path='").append(discoveryPath).append('\'');
sb.append(", renderable=").append(renderable);
sb.append(", method='").append(method).append('\'');
sb.append(", url='").append(url).append('\'');
sb.append(", referrer=").append(referrer);
sb.append(", fromCache=").append(isFromCache());
@@ -175,7 +175,7 @@ void onRequestWillBeSent(NetworkDomain.RequestWillBeSent request) {
}
}).otherwise(() -> {
MDC.put("uri", request.request().url());
LOG.debug("Request will be sent: {}", request.requestId());
LOG.debug("Request will be sent: {} {} {}, priority: {}", request.requestId(), request.request().method(), request.request().url(), request.request().initialPriority());

UriRequest uriRequest;
if (getRootRequest() == null) {
@@ -226,9 +226,7 @@ void onLoadingFailed(NetworkDomain.LoadingFailed f) {
LOG.debug("Could not create tls tunnel for resource: Error '{}', Blocked reason '{}'", f.errorText(), f.blockedReason());
request.setStatusCode(ExtraStatusCodes.CONNECT_FAILED.getCode());
} else {

LOG.error(
"Failed fetching page: Error '{}', Blocked reason '{}', Resource type: '{}', Canceled: {}, Req: {}, Req Id: {}",
LOG.error("Failed fetching page: Error '{}', Blocked reason '{}', Resource type: '{}', Canceled: {}, Req: {}, Req Id: {}",
f.errorText(), f.blockedReason(), f.type(), f.canceled(), request.getUrl(), f.requestId());
}
} else {