Skip to content

Commit

Permalink
fix issue with startpages not being displayed as crawled from archives (#372)
Browse files Browse the repository at this point in the history
  • Loading branch information
boogheta committed Jun 28, 2021
1 parent 8e04999 commit cd2d128
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 9 deletions.
2 changes: 1 addition & 1 deletion hyphe_backend/core.tac
Original file line number Diff line number Diff line change
Expand Up @@ -2940,7 +2940,7 @@ class Memory_Structure(customJSONRPC):
page_data = None

if include_page_metas or include_page_body:
page_data = yield self.db.get_pages(corpus, [urllru.lru_to_url(p['lru']) for p in pages['pages']], include_metas=include_page_metas, include_body=include_page_body)
page_data = yield self.db.get_pages(corpus, [p['lru'] for p in pages['pages']], include_metas=include_page_metas, include_body=include_page_body)

returnD(format_result({
'token': token,
Expand Down
7 changes: 5 additions & 2 deletions hyphe_backend/lib/mongo.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ def count_pages(self, corpus, job, **kwargs):
returnD(tot)

@inlineCallbacks
def get_pages(self, corpus, urls, include_metas=False, include_body=False, include_links=False):
def get_pages(self, corpus, urls_or_lrus, include_metas=False, include_body=False, include_links=False):
projection = {}

if not include_links:
Expand All @@ -387,7 +387,10 @@ def get_pages(self, corpus, urls, include_metas=False, include_body=False, inclu
if projection:
kwargs["projection"] = projection

result = yield self.pages(corpus).find({"url": {"$in": urls}}, **kwargs)
if urls_or_lrus[0].startswith("s:"):
result = yield self.pages(corpus).find({"lru": {"$in": urls_or_lrus}}, **kwargs)
else:
result = yield self.pages(corpus).find({"url": {"$in": urls_or_lrus}}, **kwargs)
returnD(result)

@inlineCallbacks
Expand Down
2 changes: 1 addition & 1 deletion hyphe_frontend/app/views/monitorCrawls.html
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ <h3>No crawl job</h3>
</div>
<div ng-show="job.crawl_arguments.webarchives.option" class="subtitle">
<md-tooltip md-direction="bottom">{{ job.crawl_arguments.webarchives.days_range }} days around {{ job.crawl_arguments.webarchives.date }}</md-tooltip>
<md-icon>history</md-icon> using {{job.crawl_arguments.webarchives.option}}
<md-icon>history</md-icon> from {{job.crawl_arguments.webarchives.option}}
</div>
</div>
</div>
Expand Down
4 changes: 2 additions & 2 deletions hyphe_frontend/app/views/webentity.html
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ <h1 class="word-break">
</div>
<div ng-show="job.crawl_arguments.webarchives.option" class="subtitle">
<md-tooltip md-direction="bottom">{{ job.crawl_arguments.webarchives.days_range }} days around {{ job.crawl_arguments.webarchives.date }}</md-tooltip>
<md-icon>history</md-icon> using {{job.crawl_arguments.webarchives.option}}
<md-icon>history</md-icon> from {{job.crawl_arguments.webarchives.option}}
</div>
</div>
</div>
Expand Down Expand Up @@ -491,7 +491,7 @@ <h3 style="padding: 8px; margin: 0px" ng-if="webentity.pages_total>=1">
<md-tooltip md-direction="left">Open in a new tab</md-tooltip>
<md-icon>link</md-icon>
</a>
<a ng-if="page.archive_url" href="{{page.archive_url}}" target="_blank">
<a ng-if="page.archive_url && page.archive_date_obtained" href="{{page.archive_url}}" target="_blank">
<md-tooltip md-direction="left">Open in a new tab archived page from {{ page.archive_date_obtained }}</md-tooltip>
<md-icon>history</md-icon>
</a>
Expand Down
10 changes: 7 additions & 3 deletions hyphe_frontend/app/views/webentity.js
Original file line number Diff line number Diff line change
Expand Up @@ -168,23 +168,27 @@ angular.module('hyphe.webentityController', [])
} else if (!$scope.pagesToken) {
$scope.status = {message: 'Loading pages 0 %', progress: 0}
}
$scope.webentity.startpages_lrus = $scope.webentity.startpages.map(utils.URL_to_LRU)
api.getPaginatedPages({
webentityId: $scope.webentity.id
,includePageMetas: true
,token: $scope.pagesToken
}
,function(result){
var pagesBatch = []
var required_fields = ["crawled", "archive_url", "archive_date_obtained", "archive_date_requested"]
result.pages.forEach(function(page){
if (page.archive_url && page.archive_date_obtained) {
page.archive_date_obtained = page.archive_date_obtained.replace(/^(....)(..)(..).*$/, "$1-$2-$3")
}
if (!$scope.webentity.startpages.includes(page.url)) {
if (!$scope.webentity.startpages_lrus.includes(page.lru)) {
pagesBatch.push(page)
} else {
for (var p in $scope.pages) {
if ($scope.pages[p].url === page.url) {
$scope.pages[p].crawled = page.crawled
if ($scope.pages[p].lru === page.lru) {
for (var field in required_fields) {
$scope.pages[p][required_fields[field]] = page[required_fields[field]]
}
break
}
}
Expand Down

0 comments on commit cd2d128

Please sign in to comment.