Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refine referenced filenames #3547 #3681

Merged
merged 5 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
74 changes: 74 additions & 0 deletions src/licensedcode/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,25 @@ def identifier_with_expression(self):
id_safe_expression = python_safe_name(s=str(self.license_expression))
return "{}-{}".format(id_safe_expression, self._identifier)

@property
def is_unknown(self):
    """
    Return True if there are unknown license keys in the license expression
    for this detection, return False otherwise.

    A detection without a license expression (None or an empty string)
    cannot contain an unknown license key, so False is returned for it
    instead of failing on the membership test.
    """
    license_expression = self.license_expression
    if not license_expression:
        return False

    # NOTE: this is a substring test against the expression and not a test
    # against parsed license keys: any expression containing one of these
    # strings anywhere is reported as unknown.
    unknown_license_keys = (
        "unknown-license-reference",
        "unknown-spdx",
        "unknown",
        "free-unknown",
    )

    return any(
        license_key in license_expression
        for license_key in unknown_license_keys
    )

def get_start_end_line(self):
"""
Return start and end line for a license detection issue, from the
Expand Down Expand Up @@ -1356,6 +1375,61 @@ def has_references_to_local_files(license_matches):
)


def use_referenced_license_expression(referenced_license_expression, license_detection, licensing=Licensing()):
    """
    Return True if the ``license_detection`` LicenseDetection should include
    the matches represented by the ``referenced_license_expression`` string.
    Return False otherwise.

    Used when we have a ``license_detection`` with a match to a license rule like
    "See license in COPYING" and where the ``referenced_license_expression`` is the
    expression found in the "COPYING" file, which is the combined expression from
    all license detections found in "COPYING" (or multiple referenced files).

    ``licensing`` is the object used to extract the license keys of both
    expressions through its ``license_keys(expression=...)`` method. The
    default instance is created once, when this function is defined.

    Reference: https://github.com/nexB/scancode-toolkit/issues/3547
    """
    # TODO: Also determine if referenced matches could be added but
    # resulting license expression should not be modified.

    if not referenced_license_expression or not license_detection:
        return False

    # We should always include referenced license matches to resolve an unknown
    # license reference
    if license_detection.is_unknown:
        return True

    detection_license_expression = license_detection.license_expression

    # We should always include referenced license matches when the license
    # expression from the referenced license matches match the license
    # expression for the detection
    if referenced_license_expression == detection_license_expression:
        return True

    license_keys = set(
        licensing.license_keys(expression=detection_license_expression)
    )
    referenced_license_keys = set(
        licensing.license_keys(expression=referenced_license_expression)
    )

    # The two expressions are known to differ here, as equal expressions
    # returned above. If we have the same license keys but not the same
    # license expression then the reference could merely be pointing to
    # notices, combining which produces a different expression, and the
    # original detection is correct
    if license_keys == referenced_license_keys:
        return False

    # when there are many license keys in an expression, and there are no
    # unknown or other cases, we cannot safely conclude that we should
    # follow the license in the referenced filenames. This is likely
    # a case where we have larger notices and several combined expressions,
    max_referenced_license_keys = 5
    if len(referenced_license_keys) > max_referenced_license_keys:
        return False

    return True


def get_detected_license_expression(
analysis,
license_matches=None,
Expand Down
70 changes: 55 additions & 15 deletions src/licensedcode/plugin_license.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from commoncode.cliutils import PluggableCommandLineOption
from commoncode.cliutils import SCAN_GROUP
from commoncode.cliutils import SCAN_OPTIONS_GROUP
from license_expression import combine_expressions
from plugincode.scan import ScanPlugin
from plugincode.scan import scan_impl

Expand All @@ -30,10 +31,12 @@
from licensedcode.detection import LicenseDetectionFromResult
from licensedcode.detection import sort_unique_detections
from licensedcode.detection import UniqueDetection
from licensedcode.detection import use_referenced_license_expression
from packagedcode.utils import combine_expressions
from scancode.api import SCANCODE_LICENSEDB_URL

TRACE = os.environ.get('SCANCODE_DEBUG_PLUGIN_LICENSE', False)
TRACE_REFERENCE = os.environ.get('SCANCODE_DEBUG_PLUGIN_LICENSE_REFERENCE', False)


def logger_debug(*args):
Expand All @@ -42,7 +45,7 @@ def logger_debug(*args):

logger = logging.getLogger(__name__)

if TRACE:
if TRACE or TRACE_REFERENCE:
import sys
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.DEBUG)
Expand Down Expand Up @@ -214,6 +217,8 @@ def process_codebase(self, codebase, license_text=False, license_diagnostics=Fal
f'before: {license_expressions_before}\n'
f'after : {license_expressions_after}'
)

#raise Exception()

license_detections = collect_license_detections(
codebase=codebase,
Expand Down Expand Up @@ -259,20 +264,28 @@ def add_referenced_filenames_license_matches_for_detections(resource, codebase):

modified = False

if TRACE_REFERENCE:
logger_debug(
f'add_referenced_license_matches: resource_path: {resource.path}',
)

for license_detection_mapping in license_detection_mappings:

license_detection = LicenseDetectionFromResult.from_license_detection_mapping(
license_detection_mapping=license_detection_mapping,
file_path=resource.path,
)
detection_modified = False
detections_added = []
license_match_mappings = license_detection_mapping["matches"]
referenced_filenames = get_referenced_filenames(license_detection.matches)

if not referenced_filenames:
if TRACE_REFERENCE:
logger_debug(
f'No references at license detection with expression: {license_detection.license_expression}',
)
continue

referenced_detections = []
for referenced_filename in referenced_filenames:
referenced_resource = find_referenced_resource(
referenced_filename=referenced_filename,
Expand All @@ -281,26 +294,53 @@ def add_referenced_filenames_license_matches_for_detections(resource, codebase):
)

if referenced_resource and referenced_resource.license_detections:
modified = True
detection_modified = True
detections_added.extend(referenced_resource.license_detections)
matches_to_extend = get_matches_from_detection_mappings(
license_detections=referenced_resource.license_detections
referenced_detections.extend(
referenced_resource.license_detections
)
populate_matches_with_path(
matches=matches_to_extend,
path=referenced_resource.path
)
license_match_mappings.extend(matches_to_extend)

if not detection_modified:
for detection in referenced_resource.license_detections:
populate_matches_with_path(
matches=detection["matches"],
path=referenced_resource.path
)

referenced_license_expression = combine_expressions(
expressions=[
detection["license_expression"]
for detection in referenced_detections
],
)
if not use_referenced_license_expression(
referenced_license_expression=referenced_license_expression,
license_detection=license_detection,
):
if TRACE_REFERENCE:
logger_debug(
f'use_referenced_license_expression: False for '
f'resource: {referenced_resource.path} and '
f'license_expression: {referenced_license_expression}',
)
continue

if TRACE_REFERENCE:
logger_debug(
f'use_referenced_license_expression: True for '
f'resource: {referenced_resource.path} and '
f'license_expression: {referenced_license_expression}',
)

modified = True
matches_to_extend = get_matches_from_detection_mappings(
license_detections=referenced_detections
)
license_match_mappings.extend(matches_to_extend)

detection_log, license_expression = get_detected_license_expression(
license_match_mappings=license_match_mappings,
analysis=DetectionCategory.UNKNOWN_FILE_REFERENCE_LOCAL.value,
post_scan=True,
)

license_expression_spdx = build_spdx_license_expression(
license_expression=str(license_expression),
licensing=get_cache().licensing,
Expand All @@ -310,7 +350,7 @@ def add_referenced_filenames_license_matches_for_detections(resource, codebase):
license_detection_mapping["detection_log"] = detection_log
license_detection_mapping["identifier"] = get_new_identifier_from_detections(
initial_detection=license_detection_mapping,
detections_added=detections_added,
detections_added=referenced_detections,
license_expression=license_expression,
)

Expand Down
78 changes: 59 additions & 19 deletions src/packagedcode/licensing.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from licensedcode.detection import detect_licenses
from licensedcode.detection import LicenseDetectionFromResult
from licensedcode.detection import populate_matches_with_path
from licensedcode.detection import use_referenced_license_expression
from licensedcode.spans import Span
from licensedcode import query

Expand Down Expand Up @@ -93,41 +94,52 @@ def add_referenced_license_matches_for_package(resource, codebase):
file_path=resource.path,
)

detection_modified = False
detections_added = []
license_match_mappings = license_detection_mapping["matches"]
referenced_filenames = get_referenced_filenames(license_detection_object.matches)
if not referenced_filenames:
continue

referenced_detections = []
for referenced_filename in referenced_filenames:
referenced_resource = find_referenced_resource(
referenced_filename=referenced_filename,
resource=resource,
codebase=codebase,
)

if not referenced_resource:
continue

referenced_license_detections = referenced_resource.license_detections

if referenced_license_detections:
modified = True
detection_modified = True
matches_to_extend = get_matches_from_detection_mappings(
license_detections=referenced_license_detections
if referenced_resource and referenced_resource.license_detections:
referenced_detections.extend(
referenced_resource.license_detections
)

# For LicenseMatches with different resources as origin, add the
# resource path to these matches as origin info
populate_matches_with_path(
matches=matches_to_extend,
path=referenced_resource.path
)
license_match_mappings.extend(matches_to_extend)

if not detection_modified:
for detection in referenced_resource.license_detections:
populate_matches_with_path(
matches=detection["matches"],
path=referenced_resource.path
)

referenced_license_expression = combine_expressions(
expressions=[
detection["license_expression"]
for detection in referenced_detections
],
)
if not use_referenced_license_expression(
referenced_license_expression=referenced_license_expression,
license_detection=license_detection_object,
):
continue

modified = True
detections_added.extend(referenced_resource.license_detections)
matches_to_extend = get_matches_from_detection_mappings(
license_detections=referenced_resource.license_detections,
)
license_match_mappings.extend(matches_to_extend)

detection_log, license_expression = get_detected_license_expression(
license_match_mappings=license_match_mappings,
analysis=DetectionCategory.PACKAGE_UNKNOWN_FILE_REFERENCE_LOCAL.value,
Expand All @@ -142,7 +154,7 @@ def add_referenced_license_matches_for_package(resource, codebase):
license_detection_mapping["detection_log"] = detection_log
license_detection_mapping["identifier"] = get_new_identifier_from_detections(
initial_detection=license_detection_mapping,
detections_added=referenced_license_detections,
detections_added=detections_added,
license_expression=license_expression,
)

Expand Down Expand Up @@ -223,7 +235,20 @@ def add_referenced_license_detection_from_package(resource, codebase):
f'sibling_license_detections: {sibling_license_detections}'
)

referenced_license_expression = combine_expressions(
expressions=[
detection["license_expression"]
for detection in sibling_license_detections
],
)
if not use_referenced_license_expression(
referenced_license_expression=referenced_license_expression,
license_detection=license_detection_object,
):
continue

for sibling_detection in sibling_license_detections:

modified = True
detection_modified = True
license_match_mappings.extend(sibling_detection["matches"])
Expand All @@ -239,6 +264,21 @@ def add_referenced_license_detection_from_package(resource, codebase):
break

pkg_detections = codebase_package["license_detections"]
if not pkg_detections:
continue

referenced_license_expression = combine_expressions(
expressions=[
detection["license_expression"]
for detection in pkg_detections
],
)
if not use_referenced_license_expression(
referenced_license_expression=referenced_license_expression,
license_detection=license_detection_object,
):
continue

for pkg_detection in pkg_detections:
modified = True
detection_modified = True
Expand Down