Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AlpinePackages pipeline #272

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions scanpipe/pipelines/alpine_packages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/nexB/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

from scanpipe.pipelines import Pipeline
from scanpipe.pipes.alpine import download_or_checkout_aports
from scanpipe.pipes.alpine import extract_summary_fields
from scanpipe.pipes.alpine import get_unscanned_packages_from_db
from scanpipe.pipes.alpine import prepare_scan_dir
from scanpipe.pipes.scancode import run_extractcode
from scanpipe.pipes.scancode import run_scancode


class AlpinePackages(Pipeline):
"""
A pipeline to complement missing alpine package data.
quepop marked this conversation as resolved.
Show resolved Hide resolved
Downloads and extracts needed information from aports repository and package source files.
Alpine Linux does not provide copyrights and (in some cases) licenses for it's packages.
"""

@classmethod
def steps(cls):
    """
    Return the tuple of step methods that make up this pipeline.
    """
    pipeline_steps = (
        cls.create_alpine_versions_dict,
        cls.download_aports_repo,
        cls.complement_missing_package_data,
    )
    return pipeline_steps

scancode_options = ["--copyright", "--summary"]

def create_alpine_versions_dict(self):
    """
    Store on `self.alpine_versions` a dict keyed by image id.

    Each value is the distro `version_id` of an image listed in the project's
    `extra_data["images"]`; only images whose distro identifier is "alpine"
    are included.
    """
    versions_by_image_id = {}
    for image in self.project.extra_data["images"]:
        distro = image["distro"]
        if distro["identifier"] != "alpine":
            continue
        versions_by_image_id[image["image_id"]] = distro["version_id"]
    self.alpine_versions = versions_by_image_id

def download_aports_repo(self):
"""
Set pipeline's `aports_dir_path` variable to it's project temporary path.
Iterate over every alpine version associated with this project.
Download corresponding aports repository branches (alpine versions).
"""
self.aports_dir_path = self.project.tmp_path
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the self.aports_dir_path variable really needed?

for image_id, alpine_version in self.alpine_versions.items():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since image_id is not used, I would suggest:
for alpine_version in self.alpine_versions.values()

Copy link
Author

@quepop quepop Sep 23, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using items() was @pombredanne 's commit suggestion.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@quepop values() would be better since you do not use the image_id variable.

download_or_checkout_aports(
aports_dir_path=self.project.tmp_path, alpine_version=alpine_version
)

def complement_missing_package_data(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The following code should be made more digestible and readable.

"""
Iterate over alpine packages associated with this project.
Checkout aports repository to the corresponding alpine version and a commit.
Prepare scan target directory - download and extract package's sources.
Run scancode and extract missing data (only copyrights for now).
Update and save package's missing data to database.
"""
for (
alpine_version,
commit_id,
scan_target_path,
scan_result_path,
package,
) in get_unscanned_packages_from_db(
project=self.project, alpine_versions=self.alpine_versions
):
Comment on lines +85 to +87
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general, when the name of the keyword argument and the provided variable is the same, it's explicit enough to only keep the variable.

For example:

get_unscanned_packages_from_db(project=self.project, alpine_versions=self.alpine_versions)

I think the following is as explicit and more readable:

get_unscanned_packages_from_db(self.project, self.alpine_versions)

It makes sense to keep the keyword args in the following example though:

run_scancode(
    location=str(scan_target_path),
    output_file=str(scan_result_path),
    options=self.scancode_options,
)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used unnamed positional arguments before, and @pombredanne commented that I should use named arguments everywhere:

Here and in general, do you mind to use named keyword arguments rather than un-named positional arguments? This makes reading much easier and is more resistant to refactorings that adds or reorders arguments

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I disagree with the "makes reading much easier" in the cases mentioned above but "more resistant to refactorings" may be a fair point.
You can leave it as-is then ;)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could store that call in an `unscanned_packages` variable to help with the for loop layout.

if not download_or_checkout_aports(
aports_dir_path=self.aports_dir_path,
alpine_version=alpine_version,
commit_id=commit_id,
) or not prepare_scan_dir(
package_name=package.name, scan_target_path=scan_target_path
):
continue
run_extractcode(location=str(scan_target_path))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This run_extractcode function does not exist in the main branch anymore since b035f00. You need to migrate to the new scancode.extract_archives API.
See https://github.com/nexB/scancode.io/blob/main/scanpipe/pipelines/scan_codebase.py#L73 for an example.

run_scancode(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest to call directly the ScanCode scancode.api.get_copyrights function instead of starting a full scancode subprocess.
This will be more efficient and will remove the need for extract_summary_fields.

location=str(scan_target_path),
output_file=str(scan_result_path),
options=self.scancode_options,
)
package.update_extra_data(
data=extract_summary_fields(
scan_result_path=scan_result_path,
summary_field_names=["copyrights"],
)
)
106 changes: 106 additions & 0 deletions scanpipe/pipes/alpine.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,114 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.


import json
from shutil import copytree

from fetchcode import fetch
from fetchcode.vcs.git import fetch_via_git
from packagedcode import alpine

from scanpipe.models import DiscoveredPackage

APORTS_URL = "https://gitlab.alpinelinux.org/alpine/aports.git"
APORTS_DIR_NAME = "aports"
APORTS_SUBDIRS = ["main", "non-free", "testing", "community", "unmaintained"]


def download_or_checkout_aports(aports_dir_path, alpine_version, commit_id=None):
    """
    Fetch the aports repository at the stable branch matching `alpine_version`.

    The branch name is built from the first two dot-separated components of
    `alpine_version` (for example "3.13.14" gives "3.13-stable").
    The repository location is the `APORTS_DIR_NAME` entry under
    `aports_dir_path`.
    If `commit_id` is provided, a second fetch targets that commit.
    Return the repository location as a string.
    """
    # TODO: Proper fetchcode patch required (extending #54)
    major, minor = alpine_version.split(".")[:2]
    checkout_location = str(aports_dir_path / APORTS_DIR_NAME)

    # The stable branch is always fetched first, then the optional commit.
    revisions = [f"{major}.{minor}-stable"]
    if commit_id:
        revisions.append(commit_id)

    for revision in revisions:
        fetch_via_git(
            url=f"git+{APORTS_URL}@{revision}", location=checkout_location
        )
    return checkout_location


def get_unscanned_packages_from_db(project, alpine_versions):
    """
    Yield one 5-tuple per alpine package of `project` that still needs a scan.

    Each tuple is
    (alpine_version, commit_id, scan_target_path, scan_result_path, package):
    - `alpine_version`: value looked up in the `alpine_versions` dict using the
      package's `extra_data["image_id"]`,
    - `commit_id`: the part of the package's `vcs_url` that follows "id=",
    - `scan_target_path`: "<name>_<version>" under the project's tmp path,
    - `scan_result_path`: "<name>_<version>.json" under the project's output
      path,
    - `package`: the DiscoveredPackage instance (type "alpine").

    A package is skipped when its scan result file already exists, or when it
    has source packages and the first one does not appear in its purl (it is
    then treated as a subpackage).
    """
    alpine_packages = DiscoveredPackage.objects.filter(project=project, type="alpine")
    for package in alpine_packages:
        scan_name = f"{package.name}_{package.version}"
        result_path = project.output_path / (scan_name + ".json")
        version = alpine_versions.get(package.extra_data["image_id"])
        commit = package.vcs_url.split("id=")[1]
        target_path = project.tmp_path / scan_name

        sources = package.source_packages
        is_subpackage = bool(sources) and sources[0] not in package.purl
        already_scanned = result_path.exists()
        if is_subpackage or already_scanned:
            continue

        yield version, commit, target_path, result_path, package


def prepare_scan_dir(package_name, scan_target_path, aports_dir_path=None):
"""
A function to gather all the package's source files in `scan_target_path`.
Source files of an alpine package are obtained from it's aports directory whose location has to be guessed.
Such directory is present in one of the five aports repository subdirectories (main, non-free, testing, community, unmaintained).
It's name is the same as the value of the corresponding package's `name` field (hence the `package_name` parameter).
Here are some path examples:
.../aports/main/acf-db
.../aports/non-free/mongodb
Inside, there are some extra files (patches) and an APKBUILD which contains urls to source tarballs.
The function copies all these files (including APKBUILD) and downloads all the source tarballs to `scan_target_path`.
The default value of `aports_dir_path` is set to the parent of the `scan_target_path`.
If the package's aports path is found/guessed and it's also not empty the returned value is `scan_target_path`.
"""
if aports_dir_path is None:
aports_dir_path = scan_target_path.parent
for subdir_name in APORTS_SUBDIRS:
quepop marked this conversation as resolved.
Show resolved Hide resolved
apkbuild_dir = aports_dir_path / APORTS_DIR_NAME / subdir_name / package_name
if not apkbuild_dir.exists():
continue
if not any(apkbuild_dir.iterdir()):
break
copytree(apkbuild_dir, scan_target_path)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you need a copy?

Copy link
Author

@quepop quepop Aug 10, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because we have to download and extract the sources somewhere and it shouldn't be inside of the aports repo directory. Furthermore doing a scan on a single directory is in my opinion much better. If we were to do two separate scans, path handling and scan result merging would make the code much less clean. Also we have the files in one place for further investigation (if something is wrong with the package) and it simply would not be possible if we didn't copy them because we do checkouts in a loop.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, so if I understand correctly you are:

  1. making a copy of the aports directory of a given package (which would typically include the APKBUILD and some patches)
  2. in this copied directory, you will also fetch the sources (or at least only the remote sources as identified by a URL)
  3. finally you will (extract then run a scan of sorts on this directory? )

I think it could be better if you separate each operation and the process could benefit from more documentation.

Copy link
Author

@quepop quepop Aug 12, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, so if I understand correctly you are:

Yes, exactly.

I think it could be better if you separate each operation and the process could benefit from more documentation.

If I separated 1. from 2., it would have only copytree(apkbuild_dir, scan_target_path) as the function body. Also, 3. is already separate from the rest.

Copy link
Author

@quepop quepop Aug 31, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So do you want me to split it? Given my reasons above I don't think we should. Also when we look at the rest of the scancode.io source files there are many instances where much bigger functions than mine aren't split into smaller ones.

package_sources = (
alpine.parse_apkbuild(scan_target_path / "APKBUILD")
quepop marked this conversation as resolved.
Show resolved Hide resolved
.to_dict()
.get("extra_data")
.get("sources")
or []
)
for source in package_sources:
source_url = source.get("url")
if source_url:
fetch(source_url, scan_target_path)
return scan_target_path


def extract_summary_fields(scan_result_path, summary_field_names):
    """
    Collect values from the `summary` section of a scancode JSON result file.

    `scan_result_path` is the location of the scan result file and
    `summary_field_names` lists the summary fields to read.
    Return a dict mapping each requested field name to the list of its
    non-empty `value` entries. A field that is missing from the summary maps
    to an empty list.
    Raise KeyError if the file has no top-level "summary" key.
    """
    # The context manager guarantees the file is closed even when the content
    # is not valid JSON or lacks the "summary" section.
    with open(scan_result_path) as scan_result:
        summaries = json.load(scan_result)["summary"]

    return {
        field_name: [
            entry["value"]
            for entry in summaries.get(field_name, [])
            if entry["value"]
        ]
        for field_name in summary_field_names
    }


def package_getter(root_dir, **kwargs):
"""
Expand Down
1 change: 1 addition & 0 deletions scanpipe/pipes/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def scan_image_for_system_packages(project, image, detect_licenses=True):

for i, (purl, package, layer) in enumerate(installed_packages):
logger.info(f"Creating package #{i}: {purl}")
package.extra_data = {"image_id": image.image_id}
created_package = pipes.update_or_create_package(project, package.to_dict())

# We have no files for this installed package, we cannot go further.
Expand Down
Empty file.
Empty file.
Empty file.
Empty file.
34 changes: 34 additions & 0 deletions scanpipe/tests/data/example_scan_summary.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"summary": {
"copyrights": [
{
"value": "Copyright (c) A B",
"count": 51
},
{
"value": "Copyright (c) C D",
"count": 8
}
],
"holders": [
{
"value": "A B",
"count": 51
},
{
"value": "C D",
"count": 41
}
],
"authors": [
{
"value": "A B",
"count": 2
},
{
"value": "C D",
"count": 1
}
]
}
}
97 changes: 97 additions & 0 deletions scanpipe/tests/test_pipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from scanpipe.models import CodebaseResource
from scanpipe.models import DiscoveredPackage
from scanpipe.models import Project
from scanpipe.pipes import alpine
from scanpipe.pipes import codebase
from scanpipe.pipes import docker
from scanpipe.pipes import fetch
Expand Down Expand Up @@ -756,6 +757,102 @@ def test_scanpipe_pipes_rootfs_has_hash_diff(self):
codebase_resource = CodebaseResource(sha256="sha256", md5="md5")
self.assertFalse(rootfs.has_hash_diff(install_file, codebase_resource))

@mock.patch("scanpipe.pipes.alpine.fetch_via_git")
def test_scanpipe_pipes_alpine_download_or_checkout_aports(self, fetch_via_git):
    """
    Check the url and location passed to the (mocked) `fetch_via_git` by
    `download_or_checkout_aports`, with and without a `commit_id`.
    """
    base_path = Path()
    expected_location = str(base_path / alpine.APORTS_DIR_NAME)
    version = "3.13.14"

    # Without a commit id, the last fetch targets the "<major>.<minor>-stable"
    # branch.
    alpine.download_or_checkout_aports(
        aports_dir_path=base_path, alpine_version=version
    )
    fetch_via_git.assert_called_with(
        url=f"git+{alpine.APORTS_URL}@3.13-stable", location=expected_location
    )

    # With a commit id, the last fetch targets that commit.
    alpine.download_or_checkout_aports(
        aports_dir_path=base_path, alpine_version=version, commit_id="1"
    )
    fetch_via_git.assert_called_with(
        url=f"git+{alpine.APORTS_URL}@1", location=expected_location
    )

def test_scanpipe_pipes_alpine_get_unscanned_packages_from_db(self):
    """
    Check that `get_unscanned_packages_from_db` only yields alpine packages
    without an existing scan result file, with the expected tuple content.
    """
    project = Project.objects.create(name="example")
    alpine_versions = {"1": "3.12", "2": "3.13"}
    package_field_names = (
        "type",
        "name",
        "version",
        "vcs_url",
        "source_packages",
        "extra_data",
    )
    package_data = [
        ("debian",),
        ("rpm",),
        ("alpine", "A", "1.0", "id=A", [], {"image_id": "1"}),
        ("alpine", "B", "1.0", "id=B", [], {"image_id": "2"}),
    ]
    # The test will get bigger (thus arrays and loops instead of consecutive
    # function calls) - further patches for this function expected.
    expected_package_tuples = [
        (
            "3.13",
            "B",
            project.tmp_path / "B_1.0",
            project.output_path / "B_1.0.json",
        ),
    ]
    # Package "A" already has a scan result file, so it must not be yielded.
    (project.output_path / "A_1.0.json").touch()
    for package_data_tuple in package_data:
        DiscoveredPackage.objects.create(
            project=project, **dict(zip(package_field_names, package_data_tuple))
        )

    yielded_package_tuples = alpine.get_unscanned_packages_from_db(
        project=project, alpine_versions=alpine_versions
    )
    # Compare whole lists: an index-based loop would pass vacuously if nothing
    # were yielded and raise IndexError if too many packages were yielded.
    # The fifth tuple item (the package instance) is left out of the comparison.
    self.assertEqual(
        expected_package_tuples,
        [package_tuple[:4] for package_tuple in yielded_package_tuples],
    )

@mock.patch("scanpipe.pipes.alpine.alpine.parse_apkbuild")
@mock.patch("scanpipe.pipes.alpine.copytree")
def test_scanpipe_pipes_alpine_prepare_scan_dir(self, copytree, parse_apkbuild):
    """
    Check the value returned by `prepare_scan_dir` for several package names,
    with `copytree` and `parse_apkbuild` mocked out.
    """
    scan_target = Path()
    aports_root = self.data_location / alpine.APORTS_DIR_NAME

    # Make sure these package directories exist under the test data location.
    package_dirs = (("main", "A"), ("non-free", "A"), ("community", "B"))
    for subdir_name, package_name in package_dirs:
        (aports_root / subdir_name / package_name).mkdir(parents=True, exist_ok=True)

    # Pairs of (package name, expected return value).
    expected_by_package = [
        ("A", None),
        ("B", None),
        ("C", None),
        ("D", scan_target),
        ("E", scan_target),
    ]

    for package_name, expected in expected_by_package:
        result = alpine.prepare_scan_dir(
            package_name=package_name,
            scan_target_path=scan_target,
            aports_dir_path=self.data_location,
        )
        self.assertEqual(result, expected)

def test_scanpipe_pipes_alpine_extract_summary_fields(self):
    """
    Check the values extracted from the `example_scan_summary.json` fixture
    for the copyrights, holders and authors summary fields.
    """
    scan_summary_path = self.data_location / "example_scan_summary.json"
    field_names = ["copyrights", "holders", "authors"]
    expected = {
        "copyrights": ["Copyright (c) A B", "Copyright (c) C D"],
        "holders": ["A B", "C D"],
        "authors": ["A B", "C D"],
    }

    extracted = alpine.extract_summary_fields(scan_summary_path, field_names)
    self.assertEqual(extracted, expected)


class ScanPipePipesTransactionTest(TransactionTestCase):
"""
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
"root_filesystems = scanpipe.pipelines.root_filesystems:RootFS",
"scan_codebase = scanpipe.pipelines.scan_codebase:ScanCodebase",
"scan_package = scanpipe.pipelines.scan_package:ScanPackage",
"alpine_packages = scanpipe.pipelines.alpine_packages:AlpinePackages"
],
},
classifiers=[
Expand Down