Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Validate citation identifiers conform to known identifier patterns #76

Merged
merged 14 commits into from
Nov 13, 2018
Merged
5 changes: 4 additions & 1 deletion manubot/cite/cite_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
citation_to_citeproc,
standardize_citation,
)
from manubot.cite.util import is_valid_citation_string

# For manubot cite, infer --format from --output filename extensions
extension_to_format = {
Expand Down Expand Up @@ -71,8 +72,10 @@ def cli_cite(args):
# generate CSL JSON data
csl_list = list()
for citation in args.citations:
citation = standardize_citation(citation)
try:
if not is_valid_citation_string(f'@{citation}'):
continue
agitter marked this conversation as resolved.
Show resolved Hide resolved
citation = standardize_citation(citation)
csl_item = citation_to_citeproc(citation, prune=args.prune_csl)
csl_list.append(csl_item)
except Exception as error:
Expand Down
63 changes: 58 additions & 5 deletions manubot/cite/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,67 @@ def standardize_citation(citation):
return f'{source}:{identifier}'


# Compiled patterns for the identifier syntaxes that can be checked offline,
# keyed by citation source. Each is applied with `fullmatch`, so the pattern
# must describe the entire identifier.
regexes = {
    'pmid': re.compile(r'[1-9][0-9]{0,7}'),
    'pmcid': re.compile(r'PMC[0-9]+'),
    'doi': re.compile(r'10\.[0-9]{4,9}/\S+'),
}


def inspect_citation_identifier(citation):
    """
    Check citation identifiers adhere to expected formats. If an issue is
    detected a string describing the issue is returned. Otherwise returns None.

    Only the `pmid`, `pmcid`, and `doi` sources are inspected. Identifiers
    belonging to any other source are accepted without inspection.

    citation must be in `source:identifier` form, without a leading `@`.
    A ValueError propagates if citation does not contain a colon.
    """
    source, identifier = citation.split(':', 1)
    if source == 'pmid':
        # https://www.nlm.nih.gov/bsd/mms/medlineelements.html#pmid
        if identifier.startswith('PMC'):
            # Give a targeted hint for the common pmid/pmcid mix-up rather
            # than the generic format message.
            return (
                'PubMed Identifiers should start with digits rather than PMC. '
                f'Should {citation} switch the citation source to `pmcid`?'
            )
        elif not regexes['pmid'].fullmatch(identifier):
            return 'PubMed Identifiers should be 1-8 digits with no leading zeros.'
    elif source == 'pmcid':
        # https://www.nlm.nih.gov/bsd/mms/medlineelements.html#pmc
        if not identifier.startswith('PMC'):
            return 'PubMed Central Identifiers must start with `PMC`.'
        elif not regexes['pmcid'].fullmatch(identifier):
            # The prefix alone is not enough: `PMC` and `PMCabc` are invalid.
            return (
                'PubMed Central Identifiers should be `PMC` followed by '
                'one or more digits.'
            )
    elif source == 'doi':
        # https://www.crossref.org/blog/dois-and-matching-regular-expressions/
        if not identifier.startswith('10.'):
            return (
                'DOIs must start with `10.`.'
            )
        elif not regexes['doi'].fullmatch(identifier):
            return (
                'identifier does not conform to the DOI regex. '
                'Double check the DOI.'
            )
    return None


def is_valid_citation_string(string):
"""
Return True if the citation string is a properly formatted citation.
Return False if improperly formatted or a non-citation.
Return True if string is a properly formatted citation. Return False if
string is not a citation (i.e. it's an exempt category such as @fig) or is
an invalid citation. In the case string is an invalid citation, an error is
logged. This function does not catch all invalid citations, but instead
performs cursory checks, such as whether citations adhere to the expected formats.
No calls to external resources are used by these checks, so they will not
detect citations to non-existent identifiers unless those identifiers
violate their source's syntax.
"""
if not string.startswith('@'):
logging.error(f'{string} → does not start with @')
return False

citation = string[1:]
try:
source, identifier = string.lstrip('@').split(':', 1)
source, identifier = citation.split(':', 1)
except ValueError as e:
logging.error(f'Citation not splittable: {string}')
logging.error(
f'Citation not splittable via a single colon: {string}. '
'Citation strings must be in the format of `@source:identifier`.'
)
return False

if not source or not identifier:
Expand All @@ -72,6 +120,11 @@ def is_valid_citation_string(string):
logging.error(f'{string} → source "{source}" is not valid')
agitter marked this conversation as resolved.
Show resolved Hide resolved
return False

inspection = inspect_citation_identifier(citation)
if inspection:
logging.error(f'invalid {source} citation: {string}\n{inspection}')
return False

return True


Expand Down
3 changes: 2 additions & 1 deletion manubot/process/manuscript.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@

def get_citation_strings(text):
"""
Extract the deduplicated list of citations in a text
Extract the deduplicated list of citations in a text. Citations that are
clearly invalid such as `doi:/453` are not returned.
"""
citations_strings = set(citation_pattern.findall(text))
citations_strings = filter(is_valid_citation_string, citations_strings)
Expand Down
7 changes: 6 additions & 1 deletion manubot/process/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,12 @@ def get_citation_df(args, text):
tag_df = pandas.read_table(args.citation_tags_path)
na_rows_df = tag_df[tag_df.isnull().any(axis='columns')]
if not na_rows_df.empty:
logging.error(f'{args.citation_tags_path} contains rows with missing values:\n{na_rows_df}\nThis error can be caused by using spaces rather than tabs to delimit fields.\nProceeding to reread TSV with delim_whitespace=True.') # noqa: E501
logging.error(
f'{args.citation_tags_path} contains rows with missing values:\n'
f'{na_rows_df}\n'
'This error can be caused by using spaces rather than tabs to delimit fields.\n'
'Proceeding to reread TSV with delim_whitespace=True.'
)
tag_df = pandas.read_table(args.citation_tags_path, delim_whitespace=True)
tag_df['string'] = '@tag:' + tag_df.tag
for citation in tag_df.citation:
Expand Down
31 changes: 31 additions & 0 deletions tests/test_citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
citation_pattern,
citation_to_citeproc,
get_citation_id,
inspect_citation_identifier,
standardize_citation,
)
from manubot.cite.pubmed import (
Expand Down Expand Up @@ -69,6 +70,36 @@ def test_standardize_citation(citation, expected):
assert output == expected


@pytest.mark.parametrize('citation', [
    'doi:10.7717/peerj.705',
    'pmcid:PMC4304851',
    'pmid:25648772',
    'arxiv:1407.3561',
    'url:https://peerj.com/articles/705/',
])
def test_inspect_citation_identifier_passes(citation):
    """
    Well-formed citations must come back from inspect_citation_identifier
    with no issue report, i.e. None.
    """
    report = inspect_citation_identifier(citation)
    assert report is None


@pytest.mark.parametrize(['citation', 'contains'], [
    ('doi:10.771/peerj.705', 'Double check the DOI'),
    ('doi:7717/peerj.705', 'must start with `10.`'),
    ('pmcid:25648772', 'must start with `PMC`'),
    ('pmid:PMC4304851', 'Should pmid:PMC4304851 switch the citation source to `pmcid`?'),
])
def test_inspect_citation_identifier_fails(citation, contains):
    """
    Malformed citations must produce a string report from
    inspect_citation_identifier, and that report must include the expected
    fragment given by `contains`.
    """
    message = inspect_citation_identifier(citation)
    # Three separate checks so a failure pinpoints what went wrong:
    # nothing reported, wrong type reported, or wrong wording.
    assert message is not None
    assert isinstance(message, str)
    assert contains in message


@pytest.mark.xfail(reason='https://twitter.com/dhimmel/status/950443969313419264')
def test_citation_to_citeproc_doi_datacite():
citation = 'doi:10.7287/peerj.preprints.3100v1'
Expand Down