Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Shell quote user-provided subsampling options #885

Merged
merged 1 commit into from May 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
23 changes: 18 additions & 5 deletions workflow/snakemake_rules/common.smk
Expand Up @@ -2,8 +2,24 @@
"""
import datetime
from itertools import product
from shlex import (
quote as shquote, # shquote() is used in this file and also other workflow files
split as shsplitwords,
)
from urllib.parse import urlsplit

def shquotewords(s: str) -> str:
    """
    Split string *s* into (POSIX) shell words, quote each word, and join them
    back into a single space-separated string.

    This is suitable for properly quoting multi-word, user-defined values which
    should follow shell quoting and escaping semantics (e.g. to allow spaces in
    single words) but not allow shell features like variable interpolation,
    command substitution, redirection, piping, etc.

    An empty (or whitespace-only) string yields an empty string.

    Raises ValueError if *s* is None, or if *s* contains unbalanced quotes or
    a trailing escape character (the latter propagated from shlex.split()).
    """
    # On Python < 3.12, shlex.split(None) silently reads words from sys.stdin
    # instead of failing, so a missing value would block or consume unrelated
    # input.  Fail fast with the same exception type Python 3.12+ raises.
    if s is None:
        raise ValueError("s argument must not be None")

    return " ".join(shquote(word) for word in shsplitwords(s))

def numeric_date(dt=None):
"""
Convert datetime object to the numeric date.
Expand Down Expand Up @@ -74,15 +90,12 @@ def _get_filter_min_length_query(wildcards):
# as recommended by pandas:
#
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html
#
# We escape the backticks with backslashes to prevent the bash shell
# from expanding the contents between the backticks as a subprocess.
query_parts.append(f"(\`{input_name}\` == 'yes' & _length >= {min_length})")
query_parts.append(f"(`{input_name}` == 'yes' & _length >= {min_length})")
else:
query_parts.append(f"(_length >= {min_length})")

query = " | ".join(query_parts)
return f"--query \"{query}\""
return f"--query {shquote(query)}"

def _get_filter_value(wildcards, key):
for input_name in config["inputs"].keys():
Expand Down
8 changes: 4 additions & 4 deletions workflow/snakemake_rules/main_workflow.smk
Expand Up @@ -160,9 +160,9 @@ def get_priority_argument(wildcards):
return ""

if subsampling_settings["priorities"]["type"] == "proximity":
return "--priority " + get_priorities(wildcards)
return "--priority " + shquote(get_priorities(wildcards))
elif subsampling_settings["priorities"]["type"] == "file" and "file" in subsampling_settings["priorities"]:
return "--priority " + subsampling_settings["priorities"]["file"]
return "--priority " + shquote(subsampling_settings["priorities"]["file"])
else:
return ""

Expand Down Expand Up @@ -199,15 +199,15 @@ def _get_specific_subsampling_setting(setting, optional=False):
elif setting == 'max_sequences':
value = f"--subsample-max-sequences {value}"

return value
return shquotewords(value)
else:
value = ""

# Check format strings that haven't been resolved.
if re.search(r'\{.+\}', value):
raise Exception(f"The parameters for the subsampling scheme '{wildcards.subsample}' of build '{wildcards.build_name}' reference build attributes that are not defined in the configuration file: '{value}'. Add these build attributes to the appropriate configuration file and try again.")

return value
return shquotewords(value)

return _get_setting

Expand Down