Address feedback on #3462

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
nexB · Jul 24, 2023 · 8697b3f · 8697b3f
1 parent 35d8f6b
commit 8697b3f
Show file tree

Hide file tree

Showing 9 changed files with 54 additions and 37 deletions.
diff --git a/docs/source/reference/available_package_parsers.rst b/docs/source/reference/available_package_parsers.rst
@@ -23,12 +23,6 @@ parsers in scancode-toolkit during documentation builds.
      - Datasource ID
      - Primary Language
      - Documentation URL
-   * - JAR Java Archive
-     - ``*.jar``
-     - None
-     - ``java_jar``
-     - None
-     - https://en.wikipedia.org/wiki/JAR_(file_format)
    * - AboutCode ABOUT file
      - ``*.ABOUT``
      - ``about``
@@ -306,13 +300,13 @@ parsers in scancode-toolkit during documentation builds.
      - ``debian_source_metadata_tarball``
      - None
      - https://manpages.debian.org/unstable/dpkg-dev/deb.5.en.html
-   * - None
+   * - macOS disk image file
      - ``*.dmg``
        ``*.sparseimage``
      - ``dmg``
      - ``apple_dmg``
      - None
-     - None
+     - https://en.wikipedia.org/wiki/Apple_Disk_Image
    * - Java EAR application.xml
      - ``*/META-INF/application.xml``
      - ``ear``
@@ -437,6 +431,12 @@ parsers in scancode-toolkit during documentation builds.
      - ``ant_ivy_xml``
      - Java
      - https://ant.apache.org/ivy/history/latest-milestone/ivyfile.html
+   * - JAR Java Archive
+     - ``*.jar``
+     - ``jar``
+     - ``java_jar``
+     - None
+     - https://en.wikipedia.org/wiki/JAR_(file_format)
    * - Java JAR MANIFEST.MF
      - ``*/META-INF/MANIFEST.MF``
      - ``jar``
@@ -555,6 +555,12 @@ parsers in scancode-toolkit during documentation builds.
      - ``opam_file``
      - Ocaml
      - https://opam.ocaml.org/doc/Manual.html#Common-file-format
+   * - Java OSGi MANIFEST.MF
+     - None
+     - ``osgi``
+     - ``java_osgi_manifest``
+     - Java
+     - https://docs.oracle.com/javase/tutorial/deployment/jar/manifestindex.html
    * - Dart pubspec lockfile
      - ``*pubspec.lock``
      - ``pubspec``

diff --git a/src/licensedcode/detection.py b/src/licensedcode/detection.py
@@ -106,11 +106,10 @@ class DetectionCategory(Enum):
     EXTRA_WORDS = 'extra-words'
     UNKNOWN_MATCH = 'unknown-match'
     LICENSE_CLUES = 'license-clues'
-    LOW_QUALITY_MATCHES = 'license-clues'
+    LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
     IMPERFECT_COVERAGE = 'imperfect-match-coverage'
     FALSE_POSITVE = 'possible-false-positive'
     UNDETECTED_LICENSE = 'undetected-license'
-    MATCH_FRAGMENTS = 'match-fragments'
     LOW_RELEVANCE = 'low-relevance'
 
 
@@ -124,6 +123,7 @@ class DetectionRule(Enum):
     """
     UNKNOWN_MATCH = 'unknown-match'
     LICENSE_CLUES = 'license-clues'
+    LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
     FALSE_POSITIVE = 'possible-false-positive'
     NOT_LICENSE_CLUES = 'not-license-clues-as-more-detections-present'
     UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'unknown-reference-to-local-file'
@@ -1374,12 +1374,12 @@ def get_detected_license_expression(
         detection_log.append(DetectionRule.LICENSE_CLUES.value)
         return detection_log, combined_expression
 
-    elif analysis == DetectionCategory.LOW_QUALITY_MATCHES.value:
+    elif analysis == DetectionCategory.LOW_QUALITY_MATCH_FRAGMENTS.value:
         if TRACE_ANALYSIS:
             logger_debug(f'analysis {DetectionCategory.LICENSE_CLUES.value}')
         # TODO: we are temporarily returning these as license clues, and not
         # in detections but ideally we should return synthetic unknowns for these
-        detection_log.append(DetectionRule.LOW_QUALITY_MATCHES.value)
+        detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value)
         return detection_log, combined_expression
 
     else:
@@ -1501,7 +1501,7 @@ def get_ambiguous_license_detections_by_type(unique_license_detections):
 
     for detection in unique_license_detections:
         if not detection.license_expression:
-            ambi_license_detections[DetectionCategory.MATCH_FRAGMENTS.value] = detection
+            ambi_license_detections[DetectionCategory.LOW_QUALITY_MATCH_FRAGMENTS.value] = detection
 
         elif is_undetected_license_matches(license_matches=detection.matches):
             ambi_license_detections[DetectionCategory.UNDETECTED_LICENSE.value] = detection
@@ -1567,7 +1567,7 @@ def analyze_detection(license_matches, package_license=False):
         return DetectionCategory.UNKNOWN_MATCH.value
 
     elif not package_license and is_low_quality_matches(license_matches=license_matches):
-        return DetectionCategory.LOW_QUALITY_MATCHES.value
+        return DetectionCategory.LOW_QUALITY_MATCH_FRAGMENTS.value
 
     # Case where at least one of the matches have `match_coverage`
     # below IMPERFECT_MATCH_COVERAGE_THR

diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py
@@ -1193,7 +1193,7 @@ def load_rules(
     rules_data_dir=rules_data_dir,
     with_checks=True,
     is_builtin=True,
-    ignore_deprecated=True,
+    with_depreacted=False,
 ):
     """
     Return an iterable of rules loaded from rule files in ``rules_data_dir``.
@@ -1217,7 +1217,7 @@ def load_rules(
 
             try:
                 rule = Rule.from_file(rule_file=rule_file)
-                if rule.is_deprecated and ignore_deprecated:
+                if not with_depreacted and rule.is_deprecated:
                     continue 
                 else:
                     yield rule

diff --git a/src/packagedcode/misc.py b/src/packagedcode/misc.py
@@ -18,13 +18,12 @@
 # yet the purpose and semantics are rather different here
 
 # TODO: parse me!!!
-# TODO: add missing URLs and descriptions
-
 
 class JavaJarHandler(models.NonAssemblableDatafileHandler):
     datasource_id = 'java_jar'
     # NOTE: there are a few rare cases where a .zip can be a JAR.
     path_patterns = ('*.jar',)
+    default_package_type = 'jar'
     filetypes = ('zip archive', 'java archive',)
     description = 'JAR Java Archive'
     documentation_url = 'https://en.wikipedia.org/wiki/JAR_(file_format)'
@@ -270,8 +269,9 @@ class AppleDmgHandler(models.NonAssemblableDatafileHandler):
     datasource_id = 'apple_dmg'
     default_package_type = 'dmg'
     path_patterns = ('*.dmg', '*.sparseimage',)
-    description = ''
-    documentation_url = ''
+    description = 'macOS disk image file'
+    # See also https://en.wikipedia.org/wiki/Sparse_image
+    documentation_url = 'https://en.wikipedia.org/wiki/Apple_Disk_Image'
 
 
 class IsoImageHandler(models.NonAssemblableDatafileHandler):

diff --git a/src/packagedcode/readme.py b/src/packagedcode/readme.py
@@ -37,6 +37,7 @@
 
     'licence': 'extracted_license_statement',
     'license': 'extracted_license_statement',
+    # NOTE: these README files sometimes also have a "License File" field
 }
 
 
@@ -50,6 +51,8 @@ class ReadmeHandler(models.NonAssemblableDatafileHandler):
         '*/README.google',
         '*/README.thirdparty',
     )
+    description = ''
+    documentation_url = ''
 
     @classmethod
     def parse(cls, location):

diff --git a/src/summarycode/todo.py b/src/summarycode/todo.py
@@ -213,8 +213,6 @@ def get_package_identifier(package_data, file_path):
 
 
 def get_unknown_purl(package_type):
-    if not package_type:
-        package_type = "unknown"
     purl = PackageURL(type=package_type, name="unknown")
     return purl.to_string()
 
@@ -337,7 +335,7 @@ class ReviewComments(Enum):
         "been matched to rules having unknown as their license key, and these "
         "needs to be reviewed."
     )
-    MATCH_FRAGMENTS = (
+    LOW_QUALITY_MATCH_FRAGMENTS = (
         "Fragments of license text were detected which are not proper license detections "
         "and likely has misleading license expression, but this has some clues about licenses, "
         "which needs review."
@@ -391,8 +389,8 @@ def get_review_comments(detection_log):
     if LicenseDetectionCategory.UNKNOWN_MATCH.value in detection_log:
         review_comments[LicenseDetectionCategory.UNKNOWN_MATCH.value] = ReviewComments.UNKNOWN_MATCH.value
 
-    if LicenseDetectionCategory.MATCH_FRAGMENTS.value in detection_log:
-        review_comments[LicenseDetectionCategory.MATCH_FRAGMENTS.value] = ReviewComments.MATCH_FRAGMENTS.value
+    if LicenseDetectionCategory.LOW_QUALITY_MATCH_FRAGMENTS.value in detection_log:
+        review_comments[LicenseDetectionCategory.LOW_QUALITY_MATCH_FRAGMENTS.value] = ReviewComments.LOW_QUALITY_MATCH_FRAGMENTS.value
 
     if LicenseDetectionCategory.LOW_RELEVANCE.value in detection_log:
         review_comments[LicenseDetectionCategory.LOW_RELEVANCE.value] = ReviewComments.LOW_RELEVANCE.value

diff --git a/tests/packagedcode/data/plugin/help.txt b/tests/packagedcode/data/plugin/help.txt
@@ -1,11 +1,4 @@
 --------------------------------------------
-Package type:  None
-  datasource_id:     java_jar
-  documentation URL: https://en.wikipedia.org/wiki/JAR_(file_format)
-  primary language:  None
-  description:       JAR Java Archive
-  path_patterns:    '*.jar'
---------------------------------------------
 Package type:  about
   datasource_id:     about_file
   documentation URL: https://aboutcode-toolkit.readthedocs.io/en/latest/specification.html
@@ -323,9 +316,9 @@ Package type:  deb
 --------------------------------------------
 Package type:  dmg
   datasource_id:     apple_dmg
-  documentation URL: 
+  documentation URL: https://en.wikipedia.org/wiki/Apple_Disk_Image
   primary language:  None
-  description:       
+  description:       macOS disk image file
   path_patterns:    '*.dmg', '*.sparseimage'
 --------------------------------------------
 Package type:  ear
@@ -468,6 +461,13 @@ Package type:  ivy
   description:       Ant IVY dependency file
   path_patterns:    '*/ivy.xml'
 --------------------------------------------
+Package type:  jar
+  datasource_id:     java_jar
+  documentation URL: https://en.wikipedia.org/wiki/JAR_(file_format)
+  primary language:  None
+  description:       JAR Java Archive
+  path_patterns:    '*.jar'
+--------------------------------------------
 Package type:  jar
   datasource_id:     java_jar_manifest
   documentation URL: https://docs.oracle.com/javase/tutorial/deployment/jar/manifestindex.html
@@ -715,9 +715,9 @@ Package type:  pypi
 --------------------------------------------
 Package type:  readme
   datasource_id:     readme
-  documentation URL: None
+  documentation URL: 
   primary language:  None
-  description:       None
+  description:       
   path_patterns:    '*/README.android', '*/README.chromium', '*/README.facebook', '*/README.google', '*/README.thirdparty'
 --------------------------------------------
 Package type:  rpm

diff --git a/tests/packagedcode/test_package_models.py b/tests/packagedcode/test_package_models.py
@@ -153,6 +153,16 @@ def test_package_data_datasource_id_are_unique(self):
             ), f'Duplicated datasource_id: {pdh!r} with {seen[pdhid]!r}'
             seen[pdh.datasource_id] = pdh
 
+    def test_package_data_handlers_have_package_type(self):
+        """
+        Check that every DataFileHandler declares a non-empty
+        default_package_type.
+        """
+        for pdh in ALL_DATAFILE_HANDLERS:
+            pdh_type = pdh.default_package_type
+            assert pdh_type
+
+
     def test_package_data_file_patterns_are_tuples(self):
         """
         Check that all file patterns are tuples, as if they are

diff --git a/...s/summarycode/data/todo/todo_present/README.multi-orig-tarball-package-expected-diag.json b/...s/summarycode/data/todo/todo_present/README.multi-orig-tarball-package-expected-diag.json
@@ -24,7 +24,7 @@
           }
         ],
         "detection_log": [
-          "license-clues"
+          "low-quality-matches"
         ],
         "identifier": "borceux-3c39742c-edef-82b7-0cdd-fc4d9ff8b044"
       }