diff --git a/pom.xml b/pom.xml
index 1efa36b5..9e940e1d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -33,7 +33,7 @@
         11
         latest
-        75.0.3738.0
+        78.0.3882.0
         0.3.11
         0.3.18
@@ -48,7 +48,7 @@
         4.1.42.Final
-        1.5.0-puppeteer-1.14.0
+        1.18.1-puppeteer-1.20.0
         2.3.6
         0.2.2
         0.2.5
diff --git a/veidemann-chrome-client/src/main/java/no/nb/nna/veidemann/chrome/client/BrowserClientBase.java b/veidemann-chrome-client/src/main/java/no/nb/nna/veidemann/chrome/client/BrowserClientBase.java
index e6189f25..51351661 100644
--- a/veidemann-chrome-client/src/main/java/no/nb/nna/veidemann/chrome/client/BrowserClientBase.java
+++ b/veidemann-chrome-client/src/main/java/no/nb/nna/veidemann/chrome/client/BrowserClientBase.java
@@ -72,7 +72,7 @@ protected void checkVersion() {
         String browserVersion = getVersionNumber(protocolClient.getRemoteVersion());
         if (!clientVersion.equals(browserVersion)) {
             throw new RuntimeException("Chrome client version and browser version does not match. Client: "
-                    + clientVersion + ", BrowserClient: " + browserVersion);
+                    + clientVersion + ", Browser: " + browserVersion);
         }
     }
diff --git a/veidemann-chrome-client/src/main/java/no/nb/nna/veidemann/chrome/codegen/Codegen.java b/veidemann-chrome-client/src/main/java/no/nb/nna/veidemann/chrome/codegen/Codegen.java
index e5c60160..ba93e7db 100644
--- a/veidemann-chrome-client/src/main/java/no/nb/nna/veidemann/chrome/codegen/Codegen.java
+++ b/veidemann-chrome-client/src/main/java/no/nb/nna/veidemann/chrome/codegen/Codegen.java
@@ -54,7 +54,7 @@ public static void main(String args[]) throws IOException {
                 + CHROME_VERSION + "/third_party/blink/renderer/core/inspector/browser_protocol.pdl?format=text";
         String jsProtocol = "https://chromium.googlesource.com/v8/v8/+/chromium/"
-                + CHROME_VERSION.split("\\.")[2] + "/src/inspector/js_protocol.pdl?format=text";
+                + CHROME_VERSION.split("\\.")[2] + "/include/js_protocol.pdl?format=text";

        System.out.println("Using protocol definitions from:");
        System.out.println(" " + browserProtocol);
diff --git a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/BrowserControllerService.java b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/BrowserControllerService.java
index 6794692e..048ffc75 100644
--- a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/BrowserControllerService.java
+++ b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/BrowserControllerService.java
@@ -38,6 +38,8 @@
 import no.nb.nna.veidemann.commons.util.ApiTools;
 import no.nb.nna.veidemann.commons.util.CollectionNameGenerator;
 import no.nb.nna.veidemann.harvester.browsercontroller.BrowserSession;
+import org.netpreserve.commons.uri.Uri;
+import org.netpreserve.commons.uri.UriConfigs;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -132,11 +134,13 @@ public void onCompleted() {
     }

     private BrowserScript getReplacementScript(BrowserSession session, String uri) {
+        String normalizedUri = UriConfigs.WHATWG.buildUri(uri).toString();
         Label replacementLabel = ApiTools.buildLabel("type", "replacement");
         for (ConfigObject script : session.getScripts()) {
             if (ApiTools.hasLabel(script.getMeta(), replacementLabel)) {
                 for (String urlRegexp : script.getBrowserScript().getUrlRegexpList()) {
-                    if (uri.matches(urlRegexp)) {
+                    if (normalizedUri.matches(urlRegexp)) {
+                        LOG.warn("Check script {} {} {} {}", script.getMeta().getName(), normalizedUri, urlRegexp, normalizedUri.matches(urlRegexp));
                         return script.getBrowserScript();
                     }
                 }
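The getReplacementScript change above matches the configured regexps against a WHATWG-normalized form of the URI rather than the raw string it receives. A minimal sketch of that normalization step, reusing the UriConfigs call the patch adds; the input URI and the regexp below are made-up examples:

```java
import org.netpreserve.commons.uri.UriConfigs;

public class NormalizeUriExample {
    public static void main(String[] args) {
        // Hypothetical input and script regexp; only the normalization call is taken from the patch.
        String raw = "HTTP://Example.com:80/a/./b";
        String urlRegexp = "https?://example\\.com/.*";

        // Same call getReplacementScript now uses before testing the regexps.
        String normalizedUri = UriConfigs.WHATWG.buildUri(raw).toString();

        System.out.println(normalizedUri + " matches: " + normalizedUri.matches(urlRegexp));
    }
}
```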
diff --git a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/FrontierClient.java b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/FrontierClient.java
index 399479e4..5dc9f077 100644
--- a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/FrontierClient.java
+++ b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/FrontierClient.java
@@ -45,6 +45,8 @@ public class FrontierClient implements AutoCloseable {

     private final Pool pool;

+    private final boolean headlessBrowser;
+
     private static final String METRICS_NS = "veidemann";

     private static final String METRICS_SUBSYSTEM = "harvester";
@@ -86,18 +88,19 @@ public class FrontierClient implements AutoCloseable {
             .register();

     public FrontierClient(BrowserController controller, String host, int port, int maxOpenSessions,
-                          String browserWsEndpoint, String proxyHost, int firstProxyPort) {
+                          String browserWsEndpoint, String proxyHost, int firstProxyPort, boolean headlessBrowser) {
         this(controller, ManagedChannelBuilder.forAddress(host, port).usePlaintext(), maxOpenSessions,
-                browserWsEndpoint, proxyHost, firstProxyPort);
+                browserWsEndpoint, proxyHost, firstProxyPort, headlessBrowser);
     }

     /**
      * Construct client for accessing RouteGuide server using the existing channel.
      */
     public FrontierClient(BrowserController controller, ManagedChannelBuilder channelBuilder, int maxOpenSessions,
-                          String browserWsEndpoint, String proxyHost, int firstProxyPort) {
+                          String browserWsEndpoint, String proxyHost, int firstProxyPort, boolean headlessBrowser) {
         LOG.info("Setting up Frontier client");
         this.controller = controller;
+        this.headlessBrowser = headlessBrowser;
         ClientTracingInterceptor tracingInterceptor = new ClientTracingInterceptor.Builder(GlobalTracer.get()).build();
         channel = channelBuilder.intercept(tracingInterceptor).build();
         asyncStub = FrontierGrpc.newStub(channel).withWaitForReady();
@@ -240,6 +243,7 @@ public ProxySession(int proxyId, String browserWSEndpoint, String proxyHost, int
             query = query.add(proxyEntry);
         }
         query = query.add(new Entry("--ignore-certificate-errors", (String) null));
+        query = query.add(new Entry("headless", Boolean.toString(headlessBrowser)));
         browserWsEndpoint = UriConfigs.WHATWG.builder(ws).parsedQuery(query).build().toString();
         browserSessions.inc();
         LOG.info("Created session: " + this);
diff --git a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/Harvester.java b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/Harvester.java
index 5656de55..86385ab0 100644
--- a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/Harvester.java
+++ b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/Harvester.java
@@ -84,7 +84,7 @@ public Harvester start() {
         FrontierClient frontierClient = new FrontierClient(controller, SETTINGS.getFrontierHost(),
                 SETTINGS.getFrontierPort(), SETTINGS.getMaxOpenSessions(), SETTINGS.getBrowserWSEndpoint(),
-                SETTINGS.getProxyHost(), SETTINGS.getProxyPort());
+                SETTINGS.getProxyHost(), SETTINGS.getProxyPort(), SETTINGS.isHeadlessBrowser());

         RobotsServiceClient robotsServiceClient = new RobotsServiceClient(SETTINGS.getRobotsTxtEvaluatorHost(),
                 SETTINGS.getRobotsTxtEvaluatorPort());
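The FrontierClient and Harvester hunks above thread a new headlessBrowser setting down to the per-session browser WebSocket endpoint, where ProxySession adds it as a headless query parameter next to --ignore-certificate-errors. A simplified sketch of the resulting endpoint shape, using plain string handling instead of the UriConfigs builder and Entry objects the real code goes through; the endpoint value is hypothetical:

```java
public class HeadlessEndpointExample {
    // Stand-in for ProxySession's query building: only the shape of the final URL is shown here.
    static String withHeadlessFlag(String browserWsEndpoint, boolean headlessBrowser) {
        String separator = browserWsEndpoint.contains("?") ? "&" : "?";
        return browserWsEndpoint + separator + "headless=" + headlessBrowser;
    }

    public static void main(String[] args) {
        // Hypothetical endpoint; the real value comes from SETTINGS.getBrowserWSEndpoint().
        String ws = "ws://browser:3000?--ignore-certificate-errors";
        System.out.println(withHeadlessFlag(ws, true));
    }
}
```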
diff --git a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/BrowserController.java b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/BrowserController.java
index ae0bfce9..1bcc9f8e 100644
--- a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/BrowserController.java
+++ b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/BrowserController.java
@@ -153,6 +153,7 @@ public RenderResult render(int proxyId, ChromeDebugProtocolConfig protocolConfig

             PageLog.Builder pageLog = PageLog.newBuilder()
                     .setUri(queuedUri.getUri())
+                    .setJobExecutionId(queuedUri.getJobExecutionId())
                     .setExecutionId(queuedUri.getExecutionId());
             if (session.getUriRequests().getInitialRequest() == null) {
                 LOG.error("Missing initial request");
diff --git a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/BrowserSession.java b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/BrowserSession.java
index 8458e61a..afefde39 100644
--- a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/BrowserSession.java
+++ b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/BrowserSession.java
@@ -247,7 +247,6 @@ public void loadPage() throws ClientClosedException, SessionClosedException {
                 }
             });

-            session.network().setExtraHTTPHeaders(ImmutableMap.of(EXECUTION_ID, queuedUri.getExecutionId(), JOB_EXECUTION_ID, queuedUri.getJobExecutionId())).run();
             session.page().navigate(queuedUri.getUri()).withReferrer(queuedUri.getReferrer()).withTransitionType("link").run();
         } catch (ExecutionException | TimeoutException ex) {
             throw new RuntimeException(ex);
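Together with the BrowserSession hunk above, which drops the extra HTTP headers that carried the execution IDs, the BrowserController change appears to record the job execution ID on the PageLog entry instead. A minimal sketch of building such an entry; the builder calls are the ones visible in the hunk, but the import path for the generated PageLog class is an assumption:

```java
// Assumed package for the generated protobuf class; adjust to wherever PageLog actually lives.
import no.nb.nna.veidemann.api.frontier.v1.PageLog;

public class PageLogExample {
    public static void main(String[] args) {
        // Hypothetical IDs; in BrowserController they come from the queued URI.
        PageLog pageLog = PageLog.newBuilder()
                .setUri("https://example.com/")
                .setJobExecutionId("jobExecution-1")
                .setExecutionId("execution-1")
                .build();
        System.out.println(pageLog);
    }
}
```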
Status: {}", status); crawlLogs.forEach(c -> { browserSession.getUriRequests().resolveCurrentUriRequest("1").ifPresent(parent -> { - UriRequest r = UriRequest.create("1", + UriRequest r = UriRequest.create("1", "GET", c.getCrawlLog().getRequestedUri(), browserSession.queuedUri.getReferrer(), ResourceType.Other, 'R', parent, browserSession.getUriRequests().getPageSpan()); r.setStatusCode(c.getCrawlLog().getStatusCode()); @@ -335,7 +351,7 @@ private void checkForFileDownload() { }).otherwise(() -> { // No parent, this is a root request; if (c.isResponseReceived()) { - UriRequest r = UriRequest.createRoot("1", + UriRequest r = UriRequest.createRoot("1", "GET", c.uri, browserSession.queuedUri.getReferrer(), ResourceType.Other, browserSession.queuedUri.getDiscoveryPath(), browserSession.getUriRequests().getPageSpan()); r.setStatusCode(c.getCrawlLog().getStatusCode()); @@ -344,7 +360,7 @@ private void checkForFileDownload() { }); }); - innerMatchCrawlLogAndRequest(status); + innerMatchCrawlLogAndRequest(status, false); } } @@ -358,14 +374,14 @@ public boolean waitForMatcherToFinish() { // Send signal to stop waitForIdle loop signalRequestsUpdated(); } - innerMatchCrawlLogAndRequest(status); + innerMatchCrawlLogAndRequest(status, true); return success; } catch (InterruptedException e) { LOG.info("Pageload interrupted", e); finishLatch.countDown(); // Send signal to stop waitForIdle loop signalRequestsUpdated(); - innerMatchCrawlLogAndRequest(status); + innerMatchCrawlLogAndRequest(status, true); return false; } } @@ -393,6 +409,10 @@ private boolean uriEquals(String u1, String u2) { } private boolean innerFindRequestForCrawlLog(Entry crawlLogEntry, UriRequest r) { + if (crawlLogEntry.isResolved()) { + return true; + } + boolean requestFound = false; Timestamp now = ProtoUtils.getNowTs(); if (r.getCrawlLog() == null @@ -438,6 +458,11 @@ && uriEquals(r.getUrl(), crawlLogEntry.uri)) { // Update request to match crawllog r.setStatusCode(crawlLogEntry.getCrawlLog().getStatusCode()); requestFound = true; + } else if (r.getStatusCode() == 0) { + // Browser failed reading response, but the crawl log returned a status which we will keep + LOG.warn("Don't understand a thing, but anyway {} -- {}", crawlLogEntry.uri, "r.getStatusCode() == 0"); + r.setStatusCode(crawlLogEntry.getCrawlLog().getStatusCode()); + requestFound = true; } else { LOG.warn("Unhandled response: Request status: {}, CrawlLog status: {}, URL:{}", r.getStatusCode(), crawlLogEntry.getCrawlLog().getStatusCode(), r.getUrl()); } @@ -469,13 +494,17 @@ && uriEquals(r.getUrl(), crawlLogEntry.uri)) { return requestFound; } - private void innerMatchCrawlLogAndRequest(MatchStatus status) { + private void innerMatchCrawlLogAndRequest(MatchStatus status, boolean lastInvocation) { status.reset(); if (!isCrawlLogsResolved()) { crawlLogs.stream().filter(e -> (!e.isResolved())) .forEach(e -> findRequestForCrawlLog(e, browserSession.getUriRequests().getInitialRequest())); if (!isCrawlLogsResolved()) { - LOG.trace("There are still unhandled crawl logs"); + if (lastInvocation) { + LOG.error("There are still unhandled crawl logs"); + } else { + LOG.trace("There are still unhandled crawl logs"); + } } } @@ -483,8 +512,13 @@ private void innerMatchCrawlLogAndRequest(MatchStatus status) { if (re.getCrawlLog() == null) { // Only requests that comes from the origin server should be added to the unhandled requests list if (!re.isFromCache() && re.isFromProxy() && re.getStatusCode() >= 0) { - LOG.error("Missing CrawlLog for {} {} {} {}, fromCache: {}, fromProxy: 
{}", re.getRequestId(), - re.getStatusCode(), re.getUrl(), re.getDiscoveryPath(), re.isFromCache(), re.isFromProxy()); + if (lastInvocation) { + LOG.error("Missing CrawlLog for {} {} {} {}, fromCache: {}, fromProxy: {}", re.getRequestId(), + re.getStatusCode(), re.getUrl(), re.getDiscoveryPath(), re.isFromCache(), re.isFromProxy()); + } else { + LOG.trace("Missing CrawlLog for {} {} {} {}, fromCache: {}, fromProxy: {}", re.getRequestId(), + re.getStatusCode(), re.getUrl(), re.getDiscoveryPath(), re.isFromCache(), re.isFromProxy()); + } status.unhandledRequests.add(re); } @@ -530,4 +564,11 @@ public String toString() { return sb.toString(); } } + + @Override + public String toString() { + StringJoiner sj = new StringJoiner("\n", CrawlLogRegistry.class.getSimpleName() + "[", "]"); + crawlLogs.forEach(e -> sj.add(e.toString())); + return sj.toString(); + } } diff --git a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/UriRequest.java b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/UriRequest.java index a86e3c9d..c7a46199 100644 --- a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/UriRequest.java +++ b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/UriRequest.java @@ -40,6 +40,8 @@ public class UriRequest { private static final Logger LOG = LoggerFactory.getLogger(UriRequest.class); + private String method; + private String url; private String requestId; @@ -93,8 +95,9 @@ public class UriRequest { private final Condition noReferrer = referrerLock.newCondition(); private final Condition noDiscoveryPath = discoveryPathLock.newCondition(); - private UriRequest(String requestId, String url, String referrer, ResourceType type, BaseSpan parentSpan) { + private UriRequest(String requestId, String method, String url, String referrer, ResourceType type, BaseSpan parentSpan) { this.requestId = requestId; + this.method = method; this.url = url; this.referrer = referrer; this.resourceType = type; @@ -104,6 +107,7 @@ private UriRequest(String requestId, String url, String referrer, ResourceType t private UriRequest(NetworkDomain.RequestWillBeSent request, BaseSpan parentSpan) { this(request.requestId(), + request.request().method(), request.request().url(), (String) request.request().headers().getOrDefault("referer", ""), ResourceType.forName(request.type()), @@ -121,15 +125,15 @@ public static UriRequest createRoot(NetworkDomain.RequestWillBeSent request, Str return result; } - public static UriRequest createRoot(String requestId, String url, String referrer, ResourceType type, String initialDiscoveryPath, BaseSpan parentSpan) { - UriRequest result = new UriRequest(requestId, url, referrer, type, parentSpan); + public static UriRequest createRoot(String requestId, String method, String url, String referrer, ResourceType type, String initialDiscoveryPath, BaseSpan parentSpan) { + UriRequest result = new UriRequest(requestId, method, url, referrer, type, parentSpan); result.discoveryPath = initialDiscoveryPath; result.rootResource = true; return result; } - public static UriRequest create(String requestId, String url, String referrer, ResourceType type, char discoveryType, UriRequest parent, BaseSpan parentSpan) { - UriRequest result = new UriRequest(requestId, url, referrer, type, parentSpan); + public static UriRequest create(String requestId, String method, String url, String referrer, ResourceType type, char discoveryType, UriRequest parent, BaseSpan parentSpan) { + 
diff --git a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/UriRequestRegistry.java b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/UriRequestRegistry.java
index cb1fcb39..006b49e2 100644
--- a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/UriRequestRegistry.java
+++ b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/browsercontroller/UriRequestRegistry.java
@@ -175,7 +175,7 @@ void onRequestWillBeSent(NetworkDomain.RequestWillBeSent request) {
             }
         }).otherwise(() -> {
             MDC.put("uri", request.request().url());
-            LOG.debug("Request will be sent: {}", request.requestId());
+            LOG.debug("Request will be sent: {} {} {}, priority: {}", request.requestId(), request.request().method(), request.request().url(), request.request().initialPriority());

             UriRequest uriRequest;
             if (getRootRequest() == null) {
@@ -226,9 +226,7 @@ void onLoadingFailed(NetworkDomain.LoadingFailed f) {
                 LOG.debug("Could not create tls tunnel for resource: Error '{}', Blocked reason '{}'", f.errorText(), f.blockedReason());
                 request.setStatusCode(ExtraStatusCodes.CONNECT_FAILED.getCode());
             } else {
-
-                LOG.error(
-                        "Failed fetching page: Error '{}', Blocked reason '{}', Resource type: '{}', Canceled: {}, Req: {}, Req Id: {}",
+                LOG.error("Failed fetching page: Error '{}', Blocked reason '{}', Resource type: '{}', Canceled: {}, Req: {}, Req Id: {}",
                         f.errorText(), f.blockedReason(), f.type(), f.canceled(), request.getUrl(), f.requestId());
             }
         } else {
diff --git a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/settings/Settings.java b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/settings/Settings.java
index e729590c..0210dc6f 100644
--- a/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/settings/Settings.java
+++ b/veidemann-harvester/src/main/java/no/nb/nna/veidemann/harvester/settings/Settings.java
@@ -55,6 +55,8 @@ public class Settings extends CommonSettings {

     private int robotsTxtEvaluatorPort;

+    private boolean headlessBrowser;
+
     public String getProxyHost() {
         return proxyHost;
     }
@@ -190,4 +192,12 @@ public int getRobotsTxtEvaluatorPort() {
     public void setRobotsTxtEvaluatorPort(int robotsTxtEvaluatorPort) {
         this.robotsTxtEvaluatorPort = robotsTxtEvaluatorPort;
     }
+
+    public boolean isHeadlessBrowser() {
+        return headlessBrowser;
+    }
+
+    public void setHeadlessBrowser(boolean headlessBrowser) {
+        this.headlessBrowser = headlessBrowser;
+    }
 }
diff --git a/veidemann-harvester/src/main/jib/app/resources/application.conf b/veidemann-harvester/src/main/jib/app/resources/application.conf
index 0ae1fc75..da5f7e72 100644
--- a/veidemann-harvester/src/main/jib/app/resources/application.conf
+++ b/veidemann-harvester/src/main/jib/app/resources/application.conf
@@ -59,4 +59,7 @@ cachePort=3128
 cachePort=${?CACHE_PORT}

 maxOpenSessions=1
-maxOpenSessions=${?MAX_OPEN_SESSIONS}
\ No newline at end of file
+maxOpenSessions=${?MAX_OPEN_SESSIONS}
+
+headlessBrowser=true
+headlessBrowser=${?HEADLESS_BROWSER}
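The ${?HEADLESS_BROWSER} line uses HOCON's optional substitution, so the environment variable overrides the baked-in default only when it is set. A sketch of how the two lines resolve, assuming the file is loaded with Typesafe Config; how Harvester actually binds the value onto Settings.isHeadlessBrowser() is not shown in this patch:

```java
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

public class HeadlessSettingExample {
    public static void main(String[] args) {
        // Same two lines the patch adds to application.conf.
        Config config = ConfigFactory.parseString(
                "headlessBrowser=true\n"
                + "headlessBrowser=${?HEADLESS_BROWSER}\n")
                .resolve(); // falls back to the environment, so HEADLESS_BROWSER=false would win here

        System.out.println("headlessBrowser=" + config.getBoolean("headlessBrowser"));
    }
}
```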