manubot · dhimmel · Nov 13, 2018 · Oct 19, 2018 · Oct 19, 2018 · Oct 24, 2018
diff --git a/manubot/cite/cite_command.py b/manubot/cite/cite_command.py
@@ -9,6 +9,7 @@
     citation_to_citeproc,
     standardize_citation,
 )
+from manubot.cite.util import is_valid_citation_string
 
 # For manubot cite, infer --format from --output filename extensions
 extension_to_format = {
@@ -71,8 +72,10 @@ def cli_cite(args):
     # generate CSL JSON data
     csl_list = list()
     for citation in args.citations:
-        citation = standardize_citation(citation)
         try:
+            if not is_valid_citation_string(f'@{citation}'):
+                continue
+            citation = standardize_citation(citation)
             csl_item = citation_to_citeproc(citation, prune=args.prune_csl)
             csl_list.append(csl_item)
         except Exception as error:

diff --git a/manubot/cite/util.py b/manubot/cite/util.py
@@ -42,19 +42,67 @@ def standardize_citation(citation):
     return f'{source}:{identifier}'
 
 
+regexes = {
+    'pmid': re.compile(r'[1-9][0-9]{0,7}'),
+    'doi': re.compile(r'10\.[0-9]{4,9}/\S+'),
+}
+
+
+def inspect_citation_identifier(citation):
+    """
+    Check that citation identifiers adhere to expected formats. If an issue is
+    detected, a string describing it is returned. Otherwise returns None.
+    """
+    source, identifier = citation.split(':', 1)
+    if source == 'pmid':
+        # https://www.nlm.nih.gov/bsd/mms/medlineelements.html#pmid
+        if identifier.startswith('PMC'):
+            return (
+                'PubMed Identifiers should start with digits rather than PMC. '
+                f'Should {citation} switch the citation source to `pmcid`?'
+            )
+        elif not regexes['pmid'].fullmatch(identifier):
+            return 'PubMed Identifiers should be 1-8 digits with no leading zeros.'
+    if source == 'pmcid':
+        # https://www.nlm.nih.gov/bsd/mms/medlineelements.html#pmc
+        if not identifier.startswith('PMC'):
+            return 'PubMed Central Identifiers must start with `PMC`.'
+    if source == 'doi':
+        # https://www.crossref.org/blog/dois-and-matching-regular-expressions/
+        if not identifier.startswith('10.'):
+            return (
+                'DOIs must start with `10.`.'
+            )
+        elif not regexes['doi'].fullmatch(identifier):
+            return (
+                'identifier does not conform to the DOI regex. '
+                'Double check the DOI.'
+            )
+    return None
+
+
 def is_valid_citation_string(string):
     """
-    Return True if the citation string is a properly formatted citation.
-    Return False if improperly formatted or a non-citation.
+    Return True if string is a properly formatted citation. Return False if
+    string is not a citation (i.e. it's an exempt category such as @fig) or is
+    an invalid citation. If string is an invalid citation, an error is logged.
+    This function does not catch all invalid citations, but instead performs
+    cursory checks, such as whether citations adhere to the expected formats.
+    No calls to external resources are used by these checks, so they will not
+    detect citations to non-existent identifiers unless those identifiers
+    violate their source's syntax.
     """
     if not string.startswith('@'):
         logging.error(f'{string} → does not start with @')
         return False
-
+    citation = string[1:]
     try:
-        source, identifier = string.lstrip('@').split(':', 1)
+        source, identifier = citation.split(':', 1)
     except ValueError as e:
-        logging.error(f'Citation not splittable: {string}')
+        logging.error(
+            f'Citation not splittable via a single colon: {string}. '
+            'Citation strings must be in the format of `@source:identifier`.'
+        )
         return False
 
     if not source or not identifier:
@@ -72,6 +120,11 @@ def is_valid_citation_string(string):
         logging.error(f'{string} → source "{source}" is not valid')
         return False
 
+    inspection = inspect_citation_identifier(citation)
+    if inspection:
+        logging.error(f'invalid {source} citation: {string}\n{inspection}')
+        return False
+
     return True
 
 

diff --git a/manubot/process/manuscript.py b/manubot/process/manuscript.py
@@ -13,7 +13,8 @@
 
 def get_citation_strings(text):
     """
-    Extract the deduplicated list of citations in a text
+    Extract the deduplicated list of citations in a text. Citations that are
+    clearly invalid, such as `doi:/453`, are not returned.
     """
     citations_strings = set(citation_pattern.findall(text))
     citations_strings = filter(is_valid_citation_string, citations_strings)

diff --git a/manubot/process/util.py b/manubot/process/util.py
@@ -230,7 +230,12 @@ def get_citation_df(args, text):
         tag_df = pandas.read_table(args.citation_tags_path)
         na_rows_df = tag_df[tag_df.isnull().any(axis='columns')]
         if not na_rows_df.empty:
-            logging.error(f'{args.citation_tags_path} contains rows with missing values:\n{na_rows_df}\nThis error can be caused by using spaces rather than tabs to delimit fields.\nProceeding to reread TSV with delim_whitespace=True.')  # noqa: E501
+            logging.error(
+                f'{args.citation_tags_path} contains rows with missing values:\n'
+                f'{na_rows_df}\n'
+                'This error can be caused by using spaces rather than tabs to delimit fields.\n'
+                'Proceeding to reread TSV with delim_whitespace=True.'
+            )
             tag_df = pandas.read_table(args.citation_tags_path, delim_whitespace=True)
         tag_df['string'] = '@tag:' + tag_df.tag
         for citation in tag_df.citation:

diff --git a/tests/test_citations.py b/tests/test_citations.py
@@ -9,6 +9,7 @@
     citation_pattern,
     citation_to_citeproc,
     get_citation_id,
+    inspect_citation_identifier,
     standardize_citation,
 )
 from manubot.cite.pubmed import (
@@ -69,6 +70,36 @@ def test_standardize_citation(citation, expected):
     assert output == expected
 
 
+@pytest.mark.parametrize('citation', [
+    'doi:10.7717/peerj.705',
+    'pmcid:PMC4304851',
+    'pmid:25648772',
+    'arxiv:1407.3561',
+    'url:https://peerj.com/articles/705/',
+])
+def test_inspect_citation_identifier_passes(citation):
+    """
+    These citations should pass inspection by inspect_citation_identifier.
+    """
+    assert inspect_citation_identifier(citation) is None
+
+
+@pytest.mark.parametrize(['citation', 'contains'], [
+    ('doi:10.771/peerj.705', 'Double check the DOI'),
+    ('doi:7717/peerj.705', 'must start with `10.`'),
+    ('pmcid:25648772', 'must start with `PMC`'),
+    ('pmid:PMC4304851', 'Should pmid:PMC4304851 switch the citation source to `pmcid`?'),
+])
+def test_inspect_citation_identifier_fails(citation, contains):
+    """
+    These citations should fail inspection by inspect_citation_identifier.
+    """
+    report = inspect_citation_identifier(citation)
+    assert report is not None
+    assert isinstance(report, str)
+    assert contains in report
+
+
 @pytest.mark.xfail(reason='https://twitter.com/dhimmel/status/950443969313419264')
 def test_citation_to_citeproc_doi_datacite():
     citation = 'doi:10.7287/peerj.preprints.3100v1'