From 01ae375dc8845cf8c5d6a30542934e9812a23eaf Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Tue, 9 Apr 2024 14:49:15 +0530 Subject: [PATCH 1/8] Add addon pipeline for string collection Signed-off-by: Keshav Priyadarshi --- scanpipe/pipelines/collect_source_strings.py | 42 ++++++++++++ scanpipe/pipes/source_strings.py | 67 ++++++++++++++++++++ setup.cfg | 1 + 3 files changed, 110 insertions(+) create mode 100644 scanpipe/pipelines/collect_source_strings.py create mode 100644 scanpipe/pipes/source_strings.py diff --git a/scanpipe/pipelines/collect_source_strings.py b/scanpipe/pipelines/collect_source_strings.py new file mode 100644 index 000000000..7f6da3bc4 --- /dev/null +++ b/scanpipe/pipelines/collect_source_strings.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. 
+ +from scanpipe.pipelines import Pipeline +from scanpipe.pipes import source_strings + + +class CollectSourceStrings(Pipeline): + """Collect source strings from codebase files and keep them in extra data field.""" + + download_inputs = False + is_addon = True + + @classmethod + def steps(cls): + return (cls.collect_and_store_resource_strings,) + + def collect_and_store_resource_strings(self): + """ + Collect source strings from codebase files using xgettext and store + them in the extra data field. + """ + source_strings.collect_and_store_resource_strings(self.project, self.log) diff --git a/scanpipe/pipes/source_strings.py b/scanpipe/pipes/source_strings.py new file mode 100644 index 000000000..3e4813e64 --- /dev/null +++ b/scanpipe/pipes/source_strings.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download.
+ +from source_inspector import strings_xgettext + +from scanpipe.pipes import LoopProgress + + +class XgettextNotFound(Exception): + pass + + +def collect_and_store_resource_strings(project, logger=None): + """ + Collect source strings from codebase files using xgettext and store + them in the extra data field. + """ + if not strings_xgettext.is_xgettext_installed(): + raise XgettextNotFound( + "``xgettext`` not found." + "Install ``gettext`` to use this pipeline." + ) + + project_files = project.codebaseresources.files() + + resources = project_files.filter( + is_binary=False, + is_archive=False, + is_media=False, + ) + + resources_count = resources.count() + + resource_iterator = resources.iterator(chunk_size=2000) + progress = LoopProgress(resources_count, logger) + + for resource in progress.iter(resource_iterator): + _collect_and_store_resource_strings(resource) + + +def _collect_and_store_resource_strings(resource): + """ + Collect strings from a resource using xgettext and store + them in the extra data field. 
+ """ + result = strings_xgettext.collect_strings(resource.location) + strings = [item["string"] for item in result if "string" in item] + resource.update_extra_data({"source_strings": strings}) diff --git a/setup.cfg b/setup.cfg index 4893df5b9..13eff13d9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -131,6 +131,7 @@ scancodeio_pipelines = analyze_docker_image = scanpipe.pipelines.docker:Docker analyze_root_filesystem_or_vm_image = scanpipe.pipelines.root_filesystem:RootFS analyze_windows_docker_image = scanpipe.pipelines.docker_windows:DockerWindows + collect_source_strings = scanpipe.pipelines.collect_source_strings:CollectSourceStrings collect_symbols = scanpipe.pipelines.collect_symbols:CollectSymbols find_vulnerabilities = scanpipe.pipelines.find_vulnerabilities:FindVulnerabilities inspect_elf_binaries = scanpipe.pipelines.inspect_elf_binaries:InspectELFBinaries From c5e8e43b32ce021ce4ce7e34f5ffff65dbce5300 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Tue, 9 Apr 2024 16:15:25 +0530 Subject: [PATCH 2/8] Add test for collect_source_strings pipeline Signed-off-by: Keshav Priyadarshi --- scanpipe/pipes/source_strings.py | 3 +- scanpipe/tests/pipes/test_source_strings.py | 60 +++++++++++++++++++++ scanpipe/tests/test_pipelines.py | 27 ++++++++++ 3 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 scanpipe/tests/pipes/test_source_strings.py diff --git a/scanpipe/pipes/source_strings.py b/scanpipe/pipes/source_strings.py index 3e4813e64..bca608501 100644 --- a/scanpipe/pipes/source_strings.py +++ b/scanpipe/pipes/source_strings.py @@ -36,8 +36,7 @@ def collect_and_store_resource_strings(project, logger=None): """ if not strings_xgettext.is_xgettext_installed(): raise XgettextNotFound( - "``xgettext`` not found." - "Install ``gettext`` to use this pipeline." + "``xgettext`` not found. Install ``gettext`` to use this pipeline." 
) project_files = project.codebaseresources.files() diff --git a/scanpipe/tests/pipes/test_source_strings.py b/scanpipe/tests/pipes/test_source_strings.py new file mode 100644 index 000000000..f7e348da8 --- /dev/null +++ b/scanpipe/tests/pipes/test_source_strings.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. 
+ +import sys +from pathlib import Path +from unittest import skipIf + +from django.test import TestCase + +from scanpipe import pipes +from scanpipe.models import Project +from scanpipe.pipes import source_strings +from scanpipe.pipes.input import copy_input + + +class ScanPipeSourceStringsPipesTest(TestCase): + data_location = Path(__file__).parent.parent / "data" + + def setUp(self): + self.project1 = Project.objects.create(name="Analysis") + + @skipIf(sys.platform == "darwin", "Not supported on macOS") + def test_scanpipe_pipes_symbols_collect_and_store_resource_strings(self): + dir = self.project1.codebase_path / "codefile" + dir.mkdir(parents=True) + + file_location = self.data_location / "d2d-javascript" / "from" / "main.js" + copy_input(file_location, dir) + + pipes.collect_and_create_codebase_resources(self.project1) + + source_strings.collect_and_store_resource_strings(self.project1) + + main_file = self.project1.codebaseresources.files()[0] + result_extra_data_strings = main_file.extra_data.get("source_strings") + + expected_extra_data_strings = [ + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_-+=", # noqa + "Enter the desired length of your password:", + ] + self.assertCountEqual(expected_extra_data_strings, result_extra_data_strings) diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index 61cf6fdff..d7e4a4f01 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -1240,3 +1240,30 @@ def test_scanpipe_collect_symbols_pipeline_integration(self): result_extra_data_symbols = main_file.extra_data.get("source_symbols") expected_extra_data_symbols = ["generatePassword", "passwordLength", "charSet"] self.assertCountEqual(expected_extra_data_symbols, result_extra_data_symbols) + + @skipIf(sys.platform == "darwin", "Not supported on macOS") + def test_scanpipe_collect_source_strings_pipeline_integration(self): + pipeline_name = "collect_source_strings" + project1 = 
Project.objects.create(name="Analysis") + + dir = project1.codebase_path / "codefile" + dir.mkdir(parents=True) + + file_location = self.data_location / "d2d-javascript" / "from" / "main.js" + copy_input(file_location, dir) + + pipes.collect_and_create_codebase_resources(project1) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + main_file = project1.codebaseresources.files()[0] + result_extra_data_strings = main_file.extra_data.get("source_strings") + expected_extra_data_strings = [ + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_-+=", # noqa + "Enter the desired length of your password:", + ] + self.assertCountEqual(expected_extra_data_strings, result_extra_data_strings) From 8d6d8b928611cd3eed718aaf6860b0e36aa88806 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Tue, 9 Apr 2024 16:19:39 +0530 Subject: [PATCH 3/8] Update dockerfile to install xgettext Signed-off-by: Keshav Priyadarshi --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index e6bb0a6fd..34d2420d8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -40,7 +40,7 @@ ENV PYTHONPATH $PYTHONPATH:$APP_DIR # OS requirements as per # https://scancode-toolkit.readthedocs.io/en/latest/getting-started/install.html -# Also install universal-ctags for symbol collection. +# Also install universal-ctags and xgettext for symbol and string collection. 
RUN apt-get update \ && apt-get install -y --no-install-recommends \ bzip2 \ @@ -60,6 +60,7 @@ RUN apt-get update \ git \ wait-for-it \ universal-ctags \ + gettext \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* From ac1a8322d8ee65691581dfe39e6f1e04ee80bf6e Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Tue, 9 Apr 2024 16:24:28 +0530 Subject: [PATCH 4/8] Update CI to install xgettext Signed-off-by: Keshav Priyadarshi --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 28623de03..3151a6741 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -44,6 +44,9 @@ jobs: - name: Install universal ctags run: sudo apt-get install -y universal-ctags + + - name: Install xgettext + run: sudo apt-get install -y gettext - name: Install dependencies run: make dev envfile From 453bf335773539be888d949055844ff6b2cdb338 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Tue, 9 Apr 2024 16:37:25 +0530 Subject: [PATCH 5/8] Update docs Signed-off-by: Keshav Priyadarshi --- docs/built-in-pipelines.rst | 8 ++++++++ docs/installation.rst | 19 +++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/docs/built-in-pipelines.rst b/docs/built-in-pipelines.rst index ac47cdade..f6307f06a 100644 --- a/docs/built-in-pipelines.rst +++ b/docs/built-in-pipelines.rst @@ -42,6 +42,14 @@ Analyse Docker Windows Image :members: :member-order: bysource +.. _pipeline_collect_source_strings: + +Collect Source Strings (addon) +-------------------------------- +.. autoclass:: scanpipe.pipelines.collect_source_strings.CollectSourceStrings() + :members: + :member-order: bysource + .. 
_pipeline_collect_symbols: Collect Codebase Symbols (addon) diff --git a/docs/installation.rst b/docs/installation.rst index d9c5c51a3..f55ee0610 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -261,13 +261,24 @@ See also `ScanCode-toolkit Prerequisites `_ for more details. For the :ref:`pipeline_collect_symbols` pipeline, `Universal Ctags `_ is needed. -On **Linux** install it using:: - sudo apt-get install universal-ctags + * On **Linux** install it using:: -On **MacOS** install Universal Ctags using Homebrew:: + sudo apt-get install universal-ctags - brew install universal-ctags + * On **MacOS** install Universal Ctags using Homebrew:: + + brew install universal-ctags + +For the :ref:`pipeline_collect_source_strings` pipeline, `gettext `_ is needed. + + * On **Linux** install it using:: + + sudo apt-get install gettext + + * On **MacOS** install gettext using Homebrew:: + + brew install gettext Clone and Configure ^^^^^^^^^^^^^^^^^^^ From 64d7f789491841cd2b66146255ce7c9a381e8c10 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Wed, 10 Apr 2024 17:30:29 +0530 Subject: [PATCH 6/8] Only supported on Linux Signed-off-by: Keshav Priyadarshi Co-authored-by: Philippe Ombredanne --- scanpipe/tests/pipes/test_source_strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpipe/tests/pipes/test_source_strings.py b/scanpipe/tests/pipes/test_source_strings.py index f7e348da8..49e0a7716 100644 --- a/scanpipe/tests/pipes/test_source_strings.py +++ b/scanpipe/tests/pipes/test_source_strings.py @@ -38,7 +38,7 @@ class ScanPipeSourceStringsPipesTest(TestCase): def setUp(self): self.project1 = Project.objects.create(name="Analysis") - @skipIf(sys.platform == "darwin", "Not supported on macOS") + @skipIf(sys.platform != "linux", "Only supported on Linux") def test_scanpipe_pipes_symbols_collect_and_store_resource_strings(self): dir = self.project1.codebase_path / "codefile" dir.mkdir(parents=True) From 
889f087e0733419bee884194d73e762f032e5f99 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Wed, 10 Apr 2024 17:30:58 +0530 Subject: [PATCH 7/8] Only supported on Linux Signed-off-by: Keshav Priyadarshi Co-authored-by: Philippe Ombredanne --- scanpipe/tests/test_pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index d7e4a4f01..1506154ce 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -1241,7 +1241,7 @@ def test_scanpipe_collect_symbols_pipeline_integration(self): expected_extra_data_symbols = ["generatePassword", "passwordLength", "charSet"] self.assertCountEqual(expected_extra_data_symbols, result_extra_data_symbols) - @skipIf(sys.platform == "darwin", "Not supported on macOS") + @skipIf(sys.platform != "linux", "Only supported on Linux") def test_scanpipe_collect_source_strings_pipeline_integration(self): pipeline_name = "collect_source_strings" project1 = Project.objects.create(name="Analysis") From 72c4712d2eec2ba7ffb72d173b76c3294ad25262 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Wed, 10 Apr 2024 17:38:22 +0530 Subject: [PATCH 8/8] Add CHANGELOG for CollectSourceStrings pipeline Signed-off-by: Keshav Priyadarshi --- CHANGELOG.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2f6e4cb60..b1f383f35 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,10 @@ v34.3.0 (unreleased) - Associate resolved packages with their source codebase resource. https://github.com/nexB/scancode.io/issues/1140 +- Add a new `CollectSourceStrings` pipeline (addon) for collecting source strings using + xgettext. + https://github.com/nexB/scancode.io/pull/1160 + v34.2.0 (2024-03-28) --------------------