From cd2d1287994de0b7eeabd4b0337fc8fde27bc8a9 Mon Sep 17 00:00:00 2001 From: Benjamin Ooghe-Tabanou Date: Mon, 28 Jun 2021 18:46:11 +0200 Subject: [PATCH] fix issue with startpages not displayed as crawled from archives (#372) --- hyphe_backend/core.tac | 2 +- hyphe_backend/lib/mongo.py | 7 +++++-- hyphe_frontend/app/views/monitorCrawls.html | 2 +- hyphe_frontend/app/views/webentity.html | 4 ++-- hyphe_frontend/app/views/webentity.js | 10 +++++++--- 5 files changed, 16 insertions(+), 9 deletions(-) diff --git a/hyphe_backend/core.tac b/hyphe_backend/core.tac index b88457bc..b31c4ca9 100644 --- a/hyphe_backend/core.tac +++ b/hyphe_backend/core.tac @@ -2940,7 +2940,7 @@ class Memory_Structure(customJSONRPC): page_data = None if include_page_metas or include_page_body: - page_data = yield self.db.get_pages(corpus, [urllru.lru_to_url(p['lru']) for p in pages['pages']], include_metas=include_page_metas, include_body=include_page_body) + page_data = yield self.db.get_pages(corpus, [p['lru'] for p in pages['pages']], include_metas=include_page_metas, include_body=include_page_body) returnD(format_result({ 'token': token, diff --git a/hyphe_backend/lib/mongo.py b/hyphe_backend/lib/mongo.py index e91a5a2f..8a399ec0 100644 --- a/hyphe_backend/lib/mongo.py +++ b/hyphe_backend/lib/mongo.py @@ -366,7 +366,7 @@ def count_pages(self, corpus, job, **kwargs): returnD(tot) @inlineCallbacks - def get_pages(self, corpus, urls, include_metas=False, include_body=False, include_links=False): + def get_pages(self, corpus, urls_or_lrus, include_metas=False, include_body=False, include_links=False): projection = {} if not include_links: @@ -387,7 +387,10 @@ def get_pages(self, corpus, urls, include_metas=False, include_body=False, inclu if projection: kwargs["projection"] = projection - result = yield self.pages(corpus).find({"url": {"$in": urls}}, **kwargs) + if urls_or_lrus[0].startswith("s:"): + result = yield self.pages(corpus).find({"lru": {"$in": urls_or_lrus}}, **kwargs) + else: + result = yield self.pages(corpus).find({"url": {"$in": urls_or_lrus}}, **kwargs) returnD(result) @inlineCallbacks diff --git a/hyphe_frontend/app/views/monitorCrawls.html b/hyphe_frontend/app/views/monitorCrawls.html index 8d14df9a..8e3ecafc 100644 --- a/hyphe_frontend/app/views/monitorCrawls.html +++ b/hyphe_frontend/app/views/monitorCrawls.html @@ -126,7 +126,7 @@

No crawl job

{{ job.crawl_arguments.webarchives.days_range }} days around {{ job.crawl_arguments.webarchives.date }} - history using {{job.crawl_arguments.webarchives.option}} + history from {{job.crawl_arguments.webarchives.option}}
diff --git a/hyphe_frontend/app/views/webentity.html b/hyphe_frontend/app/views/webentity.html index 6c65f9be..237afe46 100644 --- a/hyphe_frontend/app/views/webentity.html +++ b/hyphe_frontend/app/views/webentity.html @@ -354,7 +354,7 @@

{{ job.crawl_arguments.webarchives.days_range }} days around {{ job.crawl_arguments.webarchives.date }} - history using {{job.crawl_arguments.webarchives.option}} + history from {{job.crawl_arguments.webarchives.option}}
@@ -491,7 +491,7 @@

Open in a new tab link - + Open in a new tab archived page from {{ page.archive_date_obtained }} history diff --git a/hyphe_frontend/app/views/webentity.js b/hyphe_frontend/app/views/webentity.js index a1457c1d..df01adcd 100644 --- a/hyphe_frontend/app/views/webentity.js +++ b/hyphe_frontend/app/views/webentity.js @@ -168,6 +168,7 @@ angular.module('hyphe.webentityController', []) } else if (!$scope.pagesToken) { $scope.status = {message: 'Loading pages 0 %', progress: 0} } + $scope.webentity.startpages_lrus = $scope.webentity.startpages.map(utils.URL_to_LRU) api.getPaginatedPages({ webentityId: $scope.webentity.id ,includePageMetas: true @@ -175,16 +176,19 @@ angular.module('hyphe.webentityController', []) } ,function(result){ var pagesBatch = [] + var required_fields = ["crawled", "archive_url", "archive_date_obtained", "archive_date_requested"] result.pages.forEach(function(page){ if (page.archive_url && page.archive_date_obtained) { page.archive_date_obtained = page.archive_date_obtained.replace(/^(....)(..)(..).*$/, "$1-$2-$3") } - if (!$scope.webentity.startpages.includes(page.url)) { + if (!$scope.webentity.startpages_lrus.includes(page.lru)) { pagesBatch.push(page) } else { for (var p in $scope.pages) { - if ($scope.pages[p].url === page.url) { - $scope.pages[p].crawled = page.crawled + if ($scope.pages[p].lru === page.lru) { + for (var field in required_fields) { + $scope.pages[p][required_fields[field]] = page[required_fields[field]] + } break } }