Commit 3a9b01c
removed some debug prints; removed a condition which checked for html content which caused #492 and also caused unwanted behaviour for datacite (doi cc) exclusion; changed some logger messages; added missing pid_url handover from external metadata harvester which caused #492; fixed typo in metrics yaml; changed version to 3.2.0; added a file touch after failed datacite id update to avoid #489
huberrob committed Mar 22, 2024
1 parent e81833c commit 3a9b01c
Showing 7 changed files with 34 additions and 22 deletions.
6 changes: 4 additions & 2 deletions fuji_server/controllers/fair_check.py
@@ -110,7 +110,7 @@ def __init__(
         self.pid_url = None  # full pid # e.g., "https://doi.org/10.1594/pangaea.906092 or url (non-pid)
         self.landing_url = None  # url of the landing page of self.pid_url
         self.origin_url = None  # the url from where all starts - in case of redirection we'll need this later on
-        self.repository_urls = []  # urls identified which could represent the repository
+        self.repository_urls = []  # urls identified which could represent the repository will need this probably for FAIRiCAT things
         self.landing_html = None
         self.landing_content_type = None
         self.landing_origin = None  # schema + authority of the landing page e.g. https://www.pangaea.de
@@ -388,6 +388,8 @@ def retrieve_metadata_external(self, target_url=None, repeat_mode=False):
         self.linked_namespace_uri.update(self.metadata_harvester.linked_namespace_uri)
         self.related_resources.extend(self.metadata_harvester.related_resources)
         self.metadata_harvester.get_signposting_object_identifier()
+        self.pid_url = self.metadata_harvester.pid_url
+        self.pid_scheme = self.metadata_harvester.pid_scheme
         self.pid_collector.update(self.metadata_harvester.pid_collector)
 
         """def lookup_metadatastandard_by_name(self, value):
@@ -648,4 +650,4 @@ def set_repository_uris(self):
                     self.repository_urls.append(publisher_url)
         if self.repository_urls:
            self.repository_urls = list(set(self.repository_urls))
-        print("REPOSITORY: ", self.repository_urls)
+        # print("REPOSITORY: ", self.repository_urls)
2 changes: 1 addition & 1 deletion fuji_server/harvester/data_harvester.py
@@ -113,7 +113,7 @@ def retrieve_all_data(self, scan_content=True):
                 timeout = 10
                 if len(ft) > self.max_number_per_mime:
                     self.logger.warning(
-                        f"FsF-F3-01M : Found more than -: {self.max_number_per_mime!s} data links (out of {len(ft)!s}) of type {fmime} will only take {self.max_number_per_mime!s}"
+                        f"FsF-F3-01M : Found more than -: {self.max_number_per_mime!s} data links (out of {len(ft)!s}) of type {fmime} will only take {self.max_number_per_mime!s} for content analysis"
                     )
                 files_to_check = ft[: self.max_number_per_mime]
                 # add the fifth one for compatibility reasons < f-uji 3.0.1, when we took the last of list of length FILES_LIMIT
39 changes: 24 additions & 15 deletions fuji_server/harvester/metadata_harvester.py
@@ -263,17 +263,22 @@ def check_if_pid_resolves_to_landing_page(self, pid_url=None):
             candidate_landing_url = self.pid_collector[pid_url].get("resolved_url")
             if candidate_landing_url and self.landing_url:
                 candidate_landing_url_parts = extract(candidate_landing_url)
-                # print(candidate_landing_url_parts )
-                # landing_url_parts = extract(self.landing_url)
                 input_id_domain = candidate_landing_url_parts.domain + "." + candidate_landing_url_parts.suffix
                 # landing_domain = landing_url_parts.domain + "." + landing_url_parts.suffix
                 if self.landing_domain != input_id_domain:
                     self.logger.warning(
                         "FsF-F1-02D : Landing page domain resolved from PID found in metadata does not match with input URL domain -:"
                         + str(pid_url)
                         + str(self.landing_domain)
                         + " <> "
                         + str(input_id_domain)
                     )
+                    self.logger.warning(
+                        "FsF-F2-01M : Landing page domain resolved from PID found in metadata does not match with input URL domain -:"
+                        + str(pid_url)
+                        + str(self.landing_domain)
+                        + " <> "
+                        + str(input_id_domain)
+                    )
                     return False
                 else:
@@ -322,6 +327,7 @@ def check_pidtest_repeat(self):
                 if idhelper.is_persistent and validated:
                     found_pids[found_id_scheme] = idhelper.get_identifier_url()
         if len(found_pids) >= 1 and self.repeat_pid_check is False:
+            # print(found_pids, next(iter(found_pids.items())))
             self.logger.info(
                 "FsF-F2-01M : Found object identifier in metadata, repeating PID check for FsF-F1-02D"
             )
@@ -702,17 +708,17 @@ def retrieve_metadata_embedded(self):
                 self.logger.error("FsF-F2-01M : Resource inaccessible -: " + str(e))
                 pass
 
-        if self.landing_url and self.is_html_page:
+        if self.landing_url:
             if self.landing_url not in ["https://datacite.org/invalid.html"]:
                 if response_status == 200:
                     if "html" in requestHelper.content_type:
                         self.raise_warning_if_javascript_page(requestHelper.response_content)
-
                     up = urlparse(self.landing_url)
                     upp = extract(self.landing_url)
                     self.landing_origin = f"{up.scheme}://{up.netloc}"
                     self.landing_domain = upp.domain + "." + upp.suffix
-                    self.landing_html = requestHelper.getResponseContent()
+                    if self.is_html_page:
+                        self.landing_html = requestHelper.getResponseContent()
                     self.landing_content_type = requestHelper.content_type
                     self.landing_redirect_list = requestHelper.redirect_list
                     self.landing_redirect_status_list = requestHelper.redirect_status_list
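The dropped `and self.is_html_page` in the hunk above is the condition "which checked for html content" from the commit message: landing URL, origin, domain, and content type are now recorded for any successfully resolved landing page, while the HTML body itself is only stored when the page really is HTML. A rough sketch of the reworked gating, using a hypothetical helper function rather than the harvester's real methods:

# Rough sketch of the reworked gating (hypothetical helper, simplified):
# landing metadata is recorded for any resolvable landing URL; the HTML
# body is kept only when the response is actually an HTML page.
from urllib.parse import urlparse


def record_landing(landing_url, content_type, body):
    info = {}
    if landing_url:  # previously this also required an HTML page (#492)
        up = urlparse(landing_url)
        info["landing_origin"] = f"{up.scheme}://{up.netloc}"
        info["landing_content_type"] = content_type
        if "html" in content_type:  # keep the body only for HTML pages
            info["landing_html"] = body
    return info


# a JSON landing "page" now still yields origin and content type
print(record_landing("https://www.pangaea.de", "application/json", "{}"))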
@@ -1441,16 +1447,19 @@ def retrieve_metadata_external(self, target_url=None, repeat_mode=False):
             target_url_list = [self.origin_url, self.landing_url]
         # specific target url
         if isinstance(target_url, str):
-            target_url_list = [target_url]
-
-        target_url_list = set(tu for tu in target_url_list if tu is not None)
-        self.retrieve_metadata_external_xml_negotiated(target_url_list)
-        self.retrieve_metadata_external_schemaorg_negotiated(target_url_list)
-        self.retrieve_metadata_external_rdf_negotiated(target_url_list)
-        self.retrieve_metadata_external_datacite()
-        if not repeat_mode:
-            self.retrieve_metadata_external_linked_metadata()
-            self.retrieve_metadata_external_oai_ore()
+            if self.use_datacite is False and "doi" == self.pid_scheme:
+                target_url_list = []
+            else:
+                target_url_list = [target_url]
+        if target_url_list:
+            target_url_list = set(tu for tu in target_url_list if tu is not None)
+            self.retrieve_metadata_external_xml_negotiated(target_url_list)
+            self.retrieve_metadata_external_schemaorg_negotiated(target_url_list)
+            self.retrieve_metadata_external_rdf_negotiated(target_url_list)
+            self.retrieve_metadata_external_datacite()
+            if not repeat_mode:
+                self.retrieve_metadata_external_linked_metadata()
+                self.retrieve_metadata_external_oai_ore()
 
         """if self.reference_elements:
             self.logger.debug(f"FsF-F2-01M : Reference metadata elements NOT FOUND -: {self.reference_elements}")
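The restructured block above implements the DataCite (DOI) exclusion fix: when use_datacite is off and the PID scheme is "doi", the repeated external-metadata pass now gets an empty target list, and the new `if target_url_list:` guard skips content negotiation entirely instead of re-querying the DOI. A condensed sketch of just that decision as a standalone function, not the harvester's actual API:

# Condensed sketch of the new exclusion logic (standalone function, not
# the harvester's real method): an empty list means "skip the external
# negotiation pass entirely", enforced by the `if target_url_list:` guard.
def build_target_urls(target_url, use_datacite, pid_scheme):
    if use_datacite is False and pid_scheme == "doi":
        target_url_list = []  # DataCite disabled: do not re-query the DOI
    else:
        target_url_list = [target_url]
    return {tu for tu in target_url_list if tu is not None}


print(build_target_urls("https://doi.org/10.1594/pangaea.906092", False, "doi"))  # set()
print(build_target_urls("https://doi.org/10.1594/pangaea.906092", True, "doi"))   # one URL to negotiate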
2 changes: 1 addition & 1 deletion fuji_server/helper/metadata_collector_rdf.py
@@ -1008,7 +1008,7 @@ def get_dcat_metadata(self, graph):
 
         datasets = list(graph[: RDF.type : DCAT.Dataset])
         table = list(graph[: RDF.type : CSVW.Column])
-        print("TABLE", len(table))
+        # print("TABLE", len(table))
         if len(datasets) > 1:
             self.logger.info("FsF-F2-01M : Found more than one DCAT Dataset description, will use first one")
         if len(datasets) > 0:
3 changes: 2 additions & 1 deletion fuji_server/helper/preprocessor.py
@@ -230,7 +230,7 @@ def retrieve_datacite_re3repos(cls):
             print("updating re3data dois")
             p = {"query": "re3data_id:*"}
             try:
-                req = requests.get(cls.DATACITE_API_REPO, params=p, headers=cls.header)
+                req = requests.get(cls.DATACITE_API_REPO, params=p, headers=cls.header, timeout=5)
                 raw = req.json()
                 for r in raw["data"]:
                     cls.re3repositories[r["id"]] = r["attributes"]["re3data"]
@@ -245,6 +245,7 @@
                     yaml.dump(cls.re3repositories, f2)
 
             except requests.exceptions.RequestException as e:
+                os.utime(re3dict_path)
                 print("Preprocessor Error: " + str(e))
                 cls.logger.error(e)
 
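Besides the new five-second timeout on the DataCite request, the except branch now touches the cached re3data YAML via os.utime(re3dict_path). Assuming the refresh logic decides staleness from the file's modification time, which is what the reference to #489 suggests, touching the file after a failed download makes the cache look fresh again, so subsequent runs do not immediately retry a failing request. A minimal demonstration of that effect:

# Minimal demonstration of the touch-after-failure pattern: os.utime(path)
# with no times argument sets the file's atime/mtime to "now", so an
# mtime-based staleness check treats the cache as fresh again.
# (That the re3data cache check is mtime-based is an assumption here.)
import os
import tempfile
import time

with tempfile.NamedTemporaryFile(delete=False) as f:
    cache_path = f.name  # stand-in for the cached re3data YAML

old_mtime = os.path.getmtime(cache_path)
time.sleep(1)

try:
    raise ConnectionError("simulated failed DataCite request")
except ConnectionError as e:
    os.utime(cache_path)  # refresh mtime despite the failure
    print("Preprocessor Error: " + str(e))

print(os.path.getmtime(cache_path) > old_mtime)  # True: cache now looks fresh
os.unlink(cache_path)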
2 changes: 1 addition & 1 deletion fuji_server/yaml/metrics_v0.5.yaml
@@ -187,7 +187,7 @@ metrics:
       metric_test_score: 1
       metric_test_maturity: 3
       metric_test_requirements:
-        - target: http://f-uji.net/vocab/metadata/sources
+        - target: http://f-uji.net/vocab/metadata/standard
           modality: any
           required:
             name:
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -61,7 +61,7 @@ license = "MIT"
 name = "fuji"
 readme = "README.md"
 requires-python = "~=3.11"  # at the moment only Python 3.11 is supported
-version = "3.1.1"
+version = "3.2.0"
 
 [project.optional-dependencies]
 dev = [
