diff --git a/metacoag b/metacoag index e4145ef..aea9716 100755 --- a/metacoag +++ b/metacoag @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -import click import concurrent.futures import csv import gc @@ -12,11 +11,13 @@ import subprocess import sys import time +import click from Bio import SeqIO from igraph import * from tqdm import tqdm -from metacoag_utils import (feature_utils, graph_utils, label_prop_utils, marker_gene_utils, matching_utils) +from metacoag_utils import (feature_utils, graph_utils, label_prop_utils, + marker_gene_utils, matching_utils) from metacoag_utils.bidirectionalmap import BidirectionalMap __author__ = "Vijini Mallawaarachchi and Yu Lin" @@ -245,7 +246,7 @@ def main( # Validate prefix if prefix != None: if not prefix.endswith("_"): - prefix = prefix + "_" + prefix = f"{prefix}_" else: prefix = "" @@ -392,7 +393,7 @@ def main( # Add vertices assembly_graph.add_vertices(node_count) - logger.info("Total number of contigs available: " + str(node_count)) + logger.info(f"Total number of contigs available: {node_count}") # Name vertices with contig identifiers for i in range(node_count): @@ -429,10 +430,7 @@ def main( # Simplify the graph assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) - logger.info( - "Total number of edges in the assembly graph: " - + str(len(list(assembly_graph.es))) - ) + logger.info(f"Total number of edges in the assembly graph: {len(list(assembly_graph.es))}") except: logger.error( @@ -454,7 +452,7 @@ def main( # Get isolated contigs with no neighbours isolated = graph_utils.get_isolated(node_count, assembly_graph) - logger.info("Total isolated contigs in the assembly graph: " + str(len(isolated))) + logger.info(f"Total isolated contigs in the assembly graph: {len(isolated)}") # Get the number of samples and the length and coverage of contigs # ------------------------------------------------------------------------ @@ -495,10 +493,8 @@ def main( if contig_lengths[contig] >= min_length: isolated_long.append(contig) - logger.info("Total long contigs: " + str(my_long)) - logger.info( - "Total isolated long contigs in the assembly graph: " + str(len(isolated_long)) - ) + logger.info(f"Total long contigs: {my_long}") + logger.info(f"Total isolated long contigs in the assembly graph: {len(isolated_long)}") # Set intra weight and inter weight # ------------------------------------------------------------------------ @@ -506,8 +502,8 @@ def main( w_intra = bin_threshold * (n_samples + 1) w_inter = break_threshold * (n_samples + 1) - logger.debug("w_intra: " + str(w_intra)) - logger.debug("w_inter: " + str(w_inter)) + logger.debug(f"w_intra: {w_intra}") + logger.debug("w_inter: {w_inter}") # Get tetramer composition of contigs # ------------------------------------------------------------------------ @@ -531,7 +527,7 @@ def main( logger.info("Scanning for single-copy marker genes") - if not os.path.exists(contigs_file + ".hmmout"): + if not os.path.exists(f"{contigs_file}.hmmout"): # Check if FragGeneScan is installed try: p = subprocess.run(["which", "run_FragGeneScan.pl"], capture_output=True) @@ -598,10 +594,7 @@ def main( mg_length_threshold=mg_threshold, ) - logger.info( - "Number of contigs containing single-copy marker genes: " - + str(len(contig_markers)) - ) + logger.info(f"Number of contigs containing single-copy marker genes: {len(contig_markers)}") # Check if there are contigs with single-copy marker genes if len(contig_markers) == 0: @@ -683,7 +676,7 @@ def main( bin_markers[i] = contig_markers[contig_num] - logger.debug("Number of initial bins detected: " + str(len(smg_iteration[0]))) + logger.debug(f"Number of initial bins detected: {len(smg_iteration[0])}") logger.debug("Initialised bins: ") logger.debug(bins) @@ -717,17 +710,14 @@ def main( d_limit=d_limit, ) - logger.debug("Number of bins after matching: " + str(len(bins))) + logger.debug(f"Number of bins after matching: {len(bins)}") logger.debug("Bins with contigs containing seed marker genes") for b in bins: - logger.debug(str(b) + ": " + str(bins[b])) + logger.debug(f"{b}: {bins[b]}") - logger.debug( - "Number of binned contigs with single-copy marker genes: " - + str(len(bin_of_contig)) - ) + logger.debug(f"Number of binned contigs with single-copy marker genes: {len(bin_of_contig)}") del smg_iteration del my_gene_counts @@ -757,10 +747,7 @@ def main( unbinned_mg_contig_lengths.items(), key=operator.itemgetter(1), reverse=True ) - logger.debug( - "Number of unbinned contigs with single-copy marker genes: " - + str(len(unbinned_mg_contigs)) - ) + logger.debug(f"Number of unbinned contigs with single-copy marker genes: {len(unbinned_mg_contigs)}") logger.info("Further assigning contigs with single-copy marker genes") @@ -790,13 +777,8 @@ def main( set(contig_markers.keys()) - set(binned_contigs_with_markers) ) - logger.debug( - "Remaining number of unbinned MG seed contigs: " + str(len(unbinned_mg_contigs)) - ) - logger.debug( - "Number of binned contigs with single-copy marker genes: " - + str(len(bin_of_contig)) - ) + logger.debug(f"Remaining number of unbinned MG seed contigs: {len(unbinned_mg_contigs)}") + logger.debug(f"Number of binned contigs with single-copy marker genes: {len(bin_of_contig)}") del unbinned_mg_contigs del unbinned_mg_contig_lengths @@ -830,12 +812,9 @@ def main( unbinned_contigs = list(set([x for x in range(node_count)]) - set(binned_contigs)) - logger.debug("Number of binned contigs: " + str(len(binned_contigs))) - logger.debug("Number of unbinned contigs: " + str(len(unbinned_contigs))) - logger.debug( - "Number of binned contigs with markers: " - + str(len(binned_contigs_with_markers)) - ) + logger.debug(f"Number of binned contigs: {len(binned_contigs)}") + logger.debug(f"Number of unbinned contigs: {len(unbinned_contigs)}") + logger.debug(f"Number of binned contigs with markers: {len(binned_contigs_with_markers)}") # Get components without labels # ----------------------------------------------------- @@ -848,7 +827,7 @@ def main( nthreads=nthreads ) - logger.debug("Number of non-isolated contigs: " + str(len(non_isolated))) + logger.debug(f"Number of non-isolated contigs: {len(non_isolated)}") # Propagate labels to vertices of unlabelled long contigs # ----------------------------------------------------- @@ -878,7 +857,7 @@ def main( weight=w_intra, ) - logger.debug("Total number of binned contigs: " + str(len(bin_of_contig))) + logger.debug(f"Total number of binned contigs: {len(bin_of_contig)}") # Further propagate labels to vertices of unlabelled long contigs # -------------------------------------------------------------------------------- @@ -906,7 +885,7 @@ def main( weight=w_inter, ) - logger.debug("Total number of binned contigs: " + str(len(bin_of_contig))) + logger.debug(f"Total number of binned contigs: {len(bin_of_contig)}") # Get binned and unbinned contigs # ----------------------------------------------------- @@ -915,8 +894,8 @@ def main( unbinned_contigs = list(set([x for x in range(node_count)]) - set(binned_contigs)) - logger.debug("Number of binned contigs: " + str(len(binned_contigs))) - logger.debug("Number of unbinned contigs: " + str(len(unbinned_contigs))) + logger.debug(f"Number of binned contigs: {len(binned_contigs)}") + logger.debug(f"Number of unbinned contigs: {len(unbinned_contigs)}") # Propagate labels to vertices of unlabelled long contigs in isolated components # ----------------------------------------------------------------------------------------------- @@ -1001,7 +980,7 @@ def main( contig_lengths=contig_lengths, ) - logger.debug("Total number of binned contigs: " + str(len(bin_of_contig))) + logger.debug(f"Total number of binned contigs: {len(bin_of_contig)}") # Further propagate labels to vertices of unlabelled long contigs # -------------------------------------------------------------------------------- @@ -1032,7 +1011,7 @@ def main( weight=MAX_WEIGHT, ) - logger.debug("Total number of binned contigs: " + str(len(bin_of_contig))) + logger.debug(f"Total number of binned contigs: {len(bin_of_contig)}") # Get elapsed time # ----------------------------------- @@ -1041,7 +1020,7 @@ def main( elapsed_time = time.time() - start_time # Print elapsed time for the process - logger.info("Elapsed time: " + str(elapsed_time) + " seconds") + logger.info(f"Elapsed time: {elapsed_time} seconds") # Get bin sizes # ----------------------------------- @@ -1075,14 +1054,7 @@ def main( no_possible_bins = True logger.debug( - "Bin " - + str(b) - + ": # contigs: " - + str(len(bins[b])) - + ", bin size: " - + str(bin_size[b]) - + "bp, # markers: " - + str(len(bin_markers[b])) + f"Bin {b}: # contigs: {len(bins[b])}, bin size: {bin_size[b]}bp, # markers: {len(bin_markers[b])}" ) min_pb = -1 @@ -1157,9 +1129,9 @@ def main( # Create output directory for bin files if not os.path.isdir(output_bins_path): - subprocess.run("mkdir -p " + output_bins_path, shell=True) + subprocess.run(f"mkdir -p {output_bins_path}", shell=True) if not os.path.isdir(lq_output_bins_path): - subprocess.run("mkdir -p " + lq_output_bins_path, shell=True) + subprocess.run(f"mkdir -p {lq_output_bins_path}", shell=True) final_bins = {} lowq_bins = {} diff --git a/metacoag_utils/feature_utils.py b/metacoag_utils/feature_utils.py index d307bca..eb9f1cf 100755 --- a/metacoag_utils/feature_utils.py +++ b/metacoag_utils/feature_utils.py @@ -78,10 +78,10 @@ def get_tetramer_profiles( contigs_file = contigs_file.split("/")[-1] if os.path.isfile( - output_path + contigs_file + ".normalized_contig_tetramers.pickle" + f"{output_path}{contigs_file}.normalized_contig_tetramers.pickle" ): with open( - output_path + contigs_file + ".normalized_contig_tetramers.pickle", "rb" + f"{output_path}{contigs_file}.normalized_contig_tetramers.pickle", "rb" ) as handle: normalized_tetramer_profiles = pickle.load(handle) @@ -103,7 +103,7 @@ def get_tetramer_profiles( i += 1 with open( - output_path + contigs_file + ".normalized_contig_tetramers.pickle", "wb" + f"{output_path}{contigs_file}.normalized_contig_tetramers.pickle", "wb" ) as handle: pickle.dump( normalized_tetramer_profiles, handle, protocol=pickle.HIGHEST_PROTOCOL @@ -157,9 +157,7 @@ def get_cov_len(contigs_file, contig_names_rev, min_length, abundance_file): coverages[contig_num].append(contig_coverage) if len(coverages) == 0: - logger.error( - "Could not find any contigs longer than " + str(min_length) + "bp." - ) + logger.error(f"Could not find any contigs longer than {min_length}bp.") logger.info("Exiting MetaCoAG... Bye...!") sys.exit(1) @@ -206,9 +204,7 @@ def get_cov_len_megahit( coverages[contig_num].append(contig_coverage) if len(coverages) == 0: - logger.error( - "Could not find any contigs longer than " + str(min_length) + "bp." - ) + logger.error(f"Could not find any contigs longer than {min_length}bp.") logger.info("Exiting MetaCoAG... Bye...!") sys.exit(1) diff --git a/metacoag_utils/graph_utils.py b/metacoag_utils/graph_utils.py index 458d147..9987f1a 100755 --- a/metacoag_utils/graph_utils.py +++ b/metacoag_utils/graph_utils.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 -import re import multiprocessing as mp +import re from collections import defaultdict from Bio import SeqIO @@ -305,7 +305,6 @@ def get_links_megahit(assembly_graph_file): def get_links_megahit_custom(assembly_graph_file): - my_map = BidirectionalMap() node_count = 0 @@ -399,7 +398,9 @@ def get_connected_components(i, assembly_graph, binned_contigs): def get_non_isolated(node_count, assembly_graph, binned_contigs, nthreads): - with mp.Pool(processes=nthreads) as pool: - non_isolated = pool.starmap(get_connected_components, [(i, assembly_graph, binned_contigs) for i in range(node_count)]) + non_isolated = pool.starmap( + get_connected_components, + [(i, assembly_graph, binned_contigs) for i in range(node_count)], + ) return non_isolated diff --git a/metacoag_utils/marker_gene_utils.py b/metacoag_utils/marker_gene_utils.py index b533ff8..d8ca8fc 100755 --- a/metacoag_utils/marker_gene_utils.py +++ b/metacoag_utils/marker_gene_utils.py @@ -20,8 +20,8 @@ def scan_for_marker_genes(contigs_file, nthreads, markerURL, hard=0): logger.info("Marker file: " + markerURL) - fragResultURL = contigs_file + ".frag.faa" - hmmResultURL = contigs_file + ".hmmout" + fragResultURL = f"{contigs_file}.frag.faa" + hmmResultURL = f"{contigs_file}.hmmout" if not (os.path.exists(fragResultURL)): fragCmd = ( fragScanURL @@ -37,7 +37,7 @@ def scan_for_marker_genes(contigs_file, nthreads, markerURL, hard=0): + contigs_file + ".frag.err" ) - logger.debug("exec cmd: " + fragCmd) + logger.debug(f"exec cmd: {fragCmd}") os.system(fragCmd) if os.path.exists(fragResultURL): @@ -58,15 +58,13 @@ def scan_for_marker_genes(contigs_file, nthreads, markerURL, hard=0): + hmmResultURL + ".err" ) - logger.debug("exec cmd: " + hmmCmd) + logger.debug(f"exec cmd: {hmmCmd}") os.system(hmmCmd) else: - logger.debug( - "HMMER search failed! Path: " + hmmResultURL + " does not exist." - ) + logger.debug(f"HMMER search failed! Path: {hmmResultURL} does not exist.") else: - logger.debug("FragGeneScan failed! Path: " + fragResultURL + " does not exist.") + logger.debug(f"FragGeneScan failed! Path: {fragResultURL} does not exist.") # Get contigs containing marker genes @@ -75,7 +73,7 @@ def get_all_contigs_with_marker_genes( ): contig_markers = {} - with open(contigs_file + ".hmmout", "r") as myfile: + with open(f"{contigs_file}.hmmout", "r") as myfile: for line in myfile.readlines(): if not line.startswith("#"): strings = line.strip().split() @@ -118,7 +116,7 @@ def get_contigs_with_marker_genes( marker_contig_counts = {} contig_markers = {} - with open(contigs_file + ".hmmout", "r") as myfile: + with open(f"{contigs_file}.hmmout", "r") as myfile: for line in myfile.readlines(): if not line.startswith("#"): strings = line.strip().split() @@ -188,7 +186,7 @@ def get_contigs_with_marker_genes_megahit( marker_contig_counts = {} contig_markers = {} - with open(contigs_file + ".hmmout", "r") as myfile: + with open(f"{contigs_file}.hmmout", "r") as myfile: for line in myfile.readlines(): if not line.startswith("#"): strings = line.strip().split() diff --git a/metacoag_utils/matching_utils.py b/metacoag_utils/matching_utils.py index e332c93..f279565 100755 --- a/metacoag_utils/matching_utils.py +++ b/metacoag_utils/matching_utils.py @@ -89,11 +89,7 @@ def match_contigs( for i in range(smg_iterations): logger.debug( - "Iteration " - + str(i) - + ": " - + str(len(smg_iteration[i])) - + " contig(s) with seed marker genes" + f"Iteration {i}: {len(smg_iteration[i])} contig(s) with seed marker genes" ) if i > 0: @@ -103,7 +99,7 @@ def match_contigs( set(smg_iteration[i]) ) to_bin = list(set(smg_iteration[i]) - common) - logger.debug(str(len(to_bin)) + " contig(s) to bin in the iteration") + logger.debug(f"{len(to_bin)} contig(s) to bin in the iteration") n_bins = len(bins) bottom_nodes = [] @@ -323,7 +319,7 @@ def match_contigs( n_bins += 1 binned_contigs_with_markers.append(longest_nb_contig) - logger.debug(str(binned_count) + " contig(s) binned in the iteration") + logger.debug(f"{binned_count} contig(s) binned in the iteration") if len(smg_iteration) > 0: del edge_weights_per_iteration diff --git a/metacoag_utils/support/combine_cov.py b/metacoag_utils/support/combine_cov.py index b7e30f0..d7ae0e1 100644 --- a/metacoag_utils/support/combine_cov.py +++ b/metacoag_utils/support/combine_cov.py @@ -39,11 +39,11 @@ def main(): # Handle for missing trailing forwardslash in output folder path if output_path[-1:] != "/": - output_path = output_path + "/" + output_path = f"{output_path}/" # Create output folder if it does not exist if not os.path.isdir(output_path): - subprocess.run("mkdir -p " + output_path, shell=True) + subprocess.run(f"mkdir -p {output_path}", shell=True) # Get coverage values from samples # --------------------------------------------------- @@ -66,8 +66,10 @@ def main(): print(f"Dataframe shape: {final_df.shape}") # Save dataframe to file - final_df.to_csv(output_path + "coverage.tsv", sep="\t", index=False, header=False) - final_df.to_csv(output_path + "coverage_with_header.tsv", sep="\t", index=False, header=True) + final_df.to_csv(f"{output_path}coverage.tsv", sep="\t", index=False, header=False) + final_df.to_csv( + f"{output_path}coverage_with_header.tsv", sep="\t", index=False, header=True + ) print(f"The combined coverage values can be found at {output_path}coverage.tsv") # Exit program diff --git a/setup.py b/setup.py index 71d6b81..204d19c 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,6 @@ import setuptools - with open("README.md", "r") as fh: long_description = fh.read() @@ -47,7 +46,7 @@ "scipy", "numpy", "tqdm", - "pandas", + "pandas", "hmmer", ], python_requires=">=3.7", diff --git a/tests/test_metacoag.py b/tests/test_metacoag.py index 724abfd..794dc80 100644 --- a/tests/test_metacoag.py +++ b/tests/test_metacoag.py @@ -46,7 +46,7 @@ def get_files_and_seq_counts(output_path): seq_counts.append(seq_count) seq_counts.sort() - + return len(output_files), seq_counts @@ -105,5 +105,3 @@ def test_n_bins_metacoag_megahit(test_metacoag_megahit_command): # Assert bin sizes assert seq_counts == [36, 40, 46, 84, 127] - -