Skip to content

Commit

Permalink
Merge pull request #262 from nf-core/combgc_new_input_param
Browse files Browse the repository at this point in the history
Combgc new input param
  • Loading branch information
jasmezz committed Apr 21, 2023
2 parents c955fac + a94a363 commit a0021ee
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 8 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#251](https://github.com/nf-core/funcscan/pull/251) Added annotation tool: Pyrodigal. (by @jasmezz)
- [#252](https://github.com/nf-core/funcscan/pull/252) Added a new parameter `--arg_rgi_savejson` that saves the file `<samplename>.json` in the RGI directory. The default output for RGI is now only `<samplename>.txt`. (by @darcy220606)
- [#253](https://github.com/nf-core/funcscan/pull/253) Updated Prodigal to have compressed output files. (by @jasmezz)
- [#262](https://github.com/nf-core/funcscan/pull/262) Added a comBGC function to screen a whole directory of antiSMASH output (one subfolder per sample). (by @jasmezz)

### `Fixed`

Expand Down
64 changes: 56 additions & 8 deletions bin/comBGC.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
SOFTWARE.
"""

tool_version = "0.5"
tool_version = "0.6.0"
welcome = """\
........................
* comBGC v.{version} *
Expand Down Expand Up @@ -61,7 +61,9 @@
these can be:
- antiSMASH: <sample name>.gbk and (optional) knownclusterblast/ directory
- DeepBGC: <sample name>.bgc.tsv
- GECCO: <sample name>.clusters.tsv""",
- GECCO: <sample name>.clusters.tsv
Note: Please provide files from a single sample only. If you would like to
summarize multiple samples, please see the --antismash_multiple_samples flag.""",
)
parser.add_argument(
"-o",
Expand All @@ -73,6 +75,16 @@
type=str,
default=".",
)
# Alternative to --input for antiSMASH results: a directory of per-sample
# subfolders. With nargs="?" and no const, giving the flag without a value
# stores None, which the rest of the script treats the same as "not given".
parser.add_argument(
"-a",
"--antismash_multiple_samples",
metavar="PATH",
dest="antismash_multiple_samples",
nargs="?",
help="""directory of antiSMASH output. Should contain subfolders (one per
sample). Can only be used if --input is not specified.""",
type=str,
)
parser.add_argument("-vv", "--verbose", help="increase output verbosity", action="store_true")
parser.add_argument("-v", "--version", help="show version number and exit", action="store_true")

Expand All @@ -81,6 +93,7 @@

# Assign input arguments to variables
# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the module; renaming it would require touching every later use.
input = args.input
# Value of --antismash_multiple_samples (None when the flag is absent).
dir_antismash = args.antismash_multiple_samples
outdir = args.outdir
verbose = args.verbose
version = args.version
Expand Down Expand Up @@ -111,15 +124,38 @@
elif path.endswith("knownclusterblast/"):
input_antismash.append(path)

# --input and --antismash_multiple_samples are alternative ways to supply
# data; abort with a usage hint if both were given.
if input and dir_antismash:
exit(
"The flags --input and --antismash_multiple_samples are mutually exclusive.\nPlease use only one of them (or see --help for how to use)."
)

# Make sure that at least one input argument is given
if not (input_antismash or input_gecco or input_deepbgc):
if not (input_antismash or input_gecco or input_deepbgc or dir_antismash):
exit("Please specify at least one input file (i.e. output from antismash, deepbgc, or gecco) or see --help")

########################
# ANTISMASH FUNCTIONS
########################


def prepare_multisample_input_antismash(antismash_dir):
    """
    Collect antiSMASH input paths for every sample found below a directory.

    A directory counts as one sample's antiSMASH output if it contains an
    "index.html" file. The sample name is taken from that directory's name and
    the GenBank file is expected at "<directory>/<sample name>.gbk" (its
    existence is not checked here).

    Args:
        antismash_dir: Path of the antiSMASH output folder. It is searched
            recursively, so it may hold one subdirectory per sample or be a
            single sample directory itself.

    Returns:
        A list with one entry per sample. Each entry is a list holding the GBK
        path and, if the sample directory has a "knownclusterblast"
        subdirectory, the path of that subdirectory as second element. The
        list is empty if no sample directory is found.
    """
    sample_paths = []
    for root, subdirs, _files in os.walk(antismash_dir):
        # Visit subdirectories in name order so the result does not depend on
        # the order in which the file system happens to list them.
        subdirs.sort()

        # Drop trailing slashes (e.g. from "results/sample1/") so no empty
        # path component is produced; keep a bare "/" as it is.
        base = root.rstrip("/") or root
        if not os.path.exists("/".join([base, "index.html"])):
            continue

        # abspath resolves "." and similar relative forms, so the sample name
        # is always the real directory name and never an empty string or ".".
        sample = os.path.basename(os.path.abspath(base))

        # Paths are joined with a literal "/" because antismash_workflow
        # derives the sample ID by splitting the GBK path on "/".
        entry = ["/".join([base, sample]) + ".gbk"]
        kcb_path = "/".join([base, "knownclusterblast"])
        if os.path.exists(kcb_path):
            entry.append(kcb_path)
        sample_paths.append(entry)
    return sample_paths


def parse_knownclusterblast(kcb_file_path):
"""
Extract MIBiG IDs from knownclusterblast TXT file.
Expand Down Expand Up @@ -148,9 +184,6 @@ def antismash_workflow(antismash_paths):
- Return data frame with aggregated info.
"""

if verbose:
print("\nParsing antiSMASH files\n... ", end="")

antismash_sum_cols = [
"Sample_ID",
"Prediction_tool",
Expand Down Expand Up @@ -186,6 +219,9 @@ def antismash_workflow(antismash_paths):

# Aggregate information
Sample_ID = gbk_path.split("/")[-1].split(".gbk")[-2] # Assuming file name equals sample name
if verbose:
print("\nParsing antiSMASH file(s): " + Sample_ID + "\n... ", end="")

with open(gbk_path) as gbk:
for record in SeqIO.parse(gbk, "genbank"): # GBK records are contigs in this case
# Initiate variables per contig
Expand Down Expand Up @@ -514,7 +550,13 @@ def gecco_workflow(gecco_paths):
########################

if __name__ == "__main__":
tools = {"antiSMASH": input_antismash, "deepBGC": input_deepbgc, "GECCO": input_gecco}
if input_antismash:
tools = {"antiSMASH": input_antismash, "deepBGC": input_deepbgc, "GECCO": input_gecco}
elif dir_antismash:
tools = {"antiSMASH": dir_antismash}
else:
tools = {"deepBGC": input_deepbgc, "GECCO": input_gecco}

tools_provided = {}

for tool in tools.keys():
Expand All @@ -532,7 +574,13 @@ def gecco_workflow(gecco_paths):

for tool in tools_provided.keys():
if tool == "antiSMASH":
summary_antismash = antismash_workflow(input_antismash)
if dir_antismash:
antismash_paths = prepare_multisample_input_antismash(dir_antismash)
for input_antismash in antismash_paths:
summary_antismash_temp = antismash_workflow(input_antismash)
summary_antismash = pd.concat([summary_antismash, summary_antismash_temp])
else:
summary_antismash = antismash_workflow(input_antismash)
elif tool == "deepBGC":
summary_deepbgc = deepbgc_workflow(input_deepbgc)
elif tool == "GECCO":
Expand Down

0 comments on commit a0021ee

Please sign in to comment.