Skip to content

Commit

Permalink
MAINT: format code
Browse files Browse the repository at this point in the history
  • Loading branch information
Vini2 committed May 15, 2023
1 parent 09f36e8 commit e7ca2a0
Show file tree
Hide file tree
Showing 8 changed files with 64 additions and 102 deletions.
96 changes: 34 additions & 62 deletions metacoag
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python3

import click
import concurrent.futures
import csv
import gc
Expand All @@ -12,11 +11,13 @@ import subprocess
import sys
import time

import click
from Bio import SeqIO
from igraph import *
from tqdm import tqdm

from metacoag_utils import (feature_utils, graph_utils, label_prop_utils, marker_gene_utils, matching_utils)
from metacoag_utils import (feature_utils, graph_utils, label_prop_utils,
marker_gene_utils, matching_utils)
from metacoag_utils.bidirectionalmap import BidirectionalMap

__author__ = "Vijini Mallawaarachchi and Yu Lin"
Expand Down Expand Up @@ -245,7 +246,7 @@ def main(
# Validate prefix
if prefix != None:
if not prefix.endswith("_"):
prefix = prefix + "_"
prefix = f"{prefix}_"
else:
prefix = ""

Expand Down Expand Up @@ -392,7 +393,7 @@ def main(

# Add vertices
assembly_graph.add_vertices(node_count)
logger.info("Total number of contigs available: " + str(node_count))
logger.info(f"Total number of contigs available: {node_count}")

# Name vertices with contig identifiers
for i in range(node_count):
Expand Down Expand Up @@ -429,10 +430,7 @@ def main(
# Simplify the graph
assembly_graph.simplify(multiple=True, loops=False, combine_edges=None)

logger.info(
"Total number of edges in the assembly graph: "
+ str(len(list(assembly_graph.es)))
)
logger.info(f"Total number of edges in the assembly graph: {len(list(assembly_graph.es))}")

except:
logger.error(
Expand All @@ -454,7 +452,7 @@ def main(
# Get isolated contigs with no neighbours
isolated = graph_utils.get_isolated(node_count, assembly_graph)

logger.info("Total isolated contigs in the assembly graph: " + str(len(isolated)))
logger.info(f"Total isolated contigs in the assembly graph: {len(isolated)}")

# Get the number of samples and the length and coverage of contigs
# ------------------------------------------------------------------------
Expand Down Expand Up @@ -495,19 +493,17 @@ def main(
if contig_lengths[contig] >= min_length:
isolated_long.append(contig)

logger.info("Total long contigs: " + str(my_long))
logger.info(
"Total isolated long contigs in the assembly graph: " + str(len(isolated_long))
)
logger.info(f"Total long contigs: {my_long}")
logger.info(f"Total isolated long contigs in the assembly graph: {len(isolated_long)}")

# Set intra weight and inter weight
# ------------------------------------------------------------------------

w_intra = bin_threshold * (n_samples + 1)
w_inter = break_threshold * (n_samples + 1)

logger.debug("w_intra: " + str(w_intra))
logger.debug("w_inter: " + str(w_inter))
logger.debug(f"w_intra: {w_intra}")
logger.debug(f"w_inter: {w_inter}")

# Get tetramer composition of contigs
# ------------------------------------------------------------------------
Expand All @@ -531,7 +527,7 @@ def main(

logger.info("Scanning for single-copy marker genes")

if not os.path.exists(contigs_file + ".hmmout"):
if not os.path.exists(f"{contigs_file}.hmmout"):
# Check if FragGeneScan is installed
try:
p = subprocess.run(["which", "run_FragGeneScan.pl"], capture_output=True)
Expand Down Expand Up @@ -598,10 +594,7 @@ def main(
mg_length_threshold=mg_threshold,
)

logger.info(
"Number of contigs containing single-copy marker genes: "
+ str(len(contig_markers))
)
logger.info(f"Number of contigs containing single-copy marker genes: {len(contig_markers)}")

# Check if there are contigs with single-copy marker genes
if len(contig_markers) == 0:
Expand Down Expand Up @@ -683,7 +676,7 @@ def main(

bin_markers[i] = contig_markers[contig_num]

logger.debug("Number of initial bins detected: " + str(len(smg_iteration[0])))
logger.debug(f"Number of initial bins detected: {len(smg_iteration[0])}")
logger.debug("Initialised bins: ")
logger.debug(bins)

Expand Down Expand Up @@ -717,17 +710,14 @@ def main(
d_limit=d_limit,
)

logger.debug("Number of bins after matching: " + str(len(bins)))
logger.debug(f"Number of bins after matching: {len(bins)}")

logger.debug("Bins with contigs containing seed marker genes")

for b in bins:
logger.debug(str(b) + ": " + str(bins[b]))
logger.debug(f"{b}: {bins[b]}")

logger.debug(
"Number of binned contigs with single-copy marker genes: "
+ str(len(bin_of_contig))
)
logger.debug(f"Number of binned contigs with single-copy marker genes: {len(bin_of_contig)}")

del smg_iteration
del my_gene_counts
Expand Down Expand Up @@ -757,10 +747,7 @@ def main(
unbinned_mg_contig_lengths.items(), key=operator.itemgetter(1), reverse=True
)

logger.debug(
"Number of unbinned contigs with single-copy marker genes: "
+ str(len(unbinned_mg_contigs))
)
logger.debug(f"Number of unbinned contigs with single-copy marker genes: {len(unbinned_mg_contigs)}")

logger.info("Further assigning contigs with single-copy marker genes")

Expand Down Expand Up @@ -790,13 +777,8 @@ def main(
set(contig_markers.keys()) - set(binned_contigs_with_markers)
)

logger.debug(
"Remaining number of unbinned MG seed contigs: " + str(len(unbinned_mg_contigs))
)
logger.debug(
"Number of binned contigs with single-copy marker genes: "
+ str(len(bin_of_contig))
)
logger.debug(f"Remaining number of unbinned MG seed contigs: {len(unbinned_mg_contigs)}")
logger.debug(f"Number of binned contigs with single-copy marker genes: {len(bin_of_contig)}")

del unbinned_mg_contigs
del unbinned_mg_contig_lengths
Expand Down Expand Up @@ -830,12 +812,9 @@ def main(

unbinned_contigs = list(set([x for x in range(node_count)]) - set(binned_contigs))

logger.debug("Number of binned contigs: " + str(len(binned_contigs)))
logger.debug("Number of unbinned contigs: " + str(len(unbinned_contigs)))
logger.debug(
"Number of binned contigs with markers: "
+ str(len(binned_contigs_with_markers))
)
logger.debug(f"Number of binned contigs: {len(binned_contigs)}")
logger.debug(f"Number of unbinned contigs: {len(unbinned_contigs)}")
logger.debug(f"Number of binned contigs with markers: {len(binned_contigs_with_markers)}")

# Get components without labels
# -----------------------------------------------------
Expand All @@ -848,7 +827,7 @@ def main(
nthreads=nthreads
)

logger.debug("Number of non-isolated contigs: " + str(len(non_isolated)))
logger.debug(f"Number of non-isolated contigs: {len(non_isolated)}")

# Propagate labels to vertices of unlabelled long contigs
# -----------------------------------------------------
Expand Down Expand Up @@ -878,7 +857,7 @@ def main(
weight=w_intra,
)

logger.debug("Total number of binned contigs: " + str(len(bin_of_contig)))
logger.debug(f"Total number of binned contigs: {len(bin_of_contig)}")

# Further propagate labels to vertices of unlabelled long contigs
# --------------------------------------------------------------------------------
Expand Down Expand Up @@ -906,7 +885,7 @@ def main(
weight=w_inter,
)

logger.debug("Total number of binned contigs: " + str(len(bin_of_contig)))
logger.debug(f"Total number of binned contigs: {len(bin_of_contig)}")

# Get binned and unbinned contigs
# -----------------------------------------------------
Expand All @@ -915,8 +894,8 @@ def main(

unbinned_contigs = list(set([x for x in range(node_count)]) - set(binned_contigs))

logger.debug("Number of binned contigs: " + str(len(binned_contigs)))
logger.debug("Number of unbinned contigs: " + str(len(unbinned_contigs)))
logger.debug(f"Number of binned contigs: {len(binned_contigs)}")
logger.debug(f"Number of unbinned contigs: {len(unbinned_contigs)}")

# Propagate labels to vertices of unlabelled long contigs in isolated components
# -----------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -1001,7 +980,7 @@ def main(
contig_lengths=contig_lengths,
)

logger.debug("Total number of binned contigs: " + str(len(bin_of_contig)))
logger.debug(f"Total number of binned contigs: {len(bin_of_contig)}")

# Further propagate labels to vertices of unlabelled long contigs
# --------------------------------------------------------------------------------
Expand Down Expand Up @@ -1032,7 +1011,7 @@ def main(
weight=MAX_WEIGHT,
)

logger.debug("Total number of binned contigs: " + str(len(bin_of_contig)))
logger.debug(f"Total number of binned contigs: {len(bin_of_contig)}")

# Get elapsed time
# -----------------------------------
Expand All @@ -1041,7 +1020,7 @@ def main(
elapsed_time = time.time() - start_time

# Print elapsed time for the process
logger.info("Elapsed time: " + str(elapsed_time) + " seconds")
logger.info(f"Elapsed time: {elapsed_time} seconds")

# Get bin sizes
# -----------------------------------
Expand Down Expand Up @@ -1075,14 +1054,7 @@ def main(
no_possible_bins = True

logger.debug(
"Bin "
+ str(b)
+ ": # contigs: "
+ str(len(bins[b]))
+ ", bin size: "
+ str(bin_size[b])
+ "bp, # markers: "
+ str(len(bin_markers[b]))
f"Bin {b}: # contigs: {len(bins[b])}, bin size: {bin_size[b]}bp, # markers: {len(bin_markers[b])}"
)

min_pb = -1
Expand Down Expand Up @@ -1157,9 +1129,9 @@ def main(

# Create output directory for bin files
if not os.path.isdir(output_bins_path):
subprocess.run("mkdir -p " + output_bins_path, shell=True)
subprocess.run(f"mkdir -p {output_bins_path}", shell=True)
if not os.path.isdir(lq_output_bins_path):
subprocess.run("mkdir -p " + lq_output_bins_path, shell=True)
subprocess.run(f"mkdir -p {lq_output_bins_path}", shell=True)

final_bins = {}
lowq_bins = {}
Expand Down
14 changes: 5 additions & 9 deletions metacoag_utils/feature_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@ def get_tetramer_profiles(
contigs_file = contigs_file.split("/")[-1]

if os.path.isfile(
output_path + contigs_file + ".normalized_contig_tetramers.pickle"
f"{output_path}{contigs_file}.normalized_contig_tetramers.pickle"
):
with open(
output_path + contigs_file + ".normalized_contig_tetramers.pickle", "rb"
f"{output_path}{contigs_file}.normalized_contig_tetramers.pickle", "rb"
) as handle:
normalized_tetramer_profiles = pickle.load(handle)

Expand All @@ -103,7 +103,7 @@ def get_tetramer_profiles(
i += 1

with open(
output_path + contigs_file + ".normalized_contig_tetramers.pickle", "wb"
f"{output_path}{contigs_file}.normalized_contig_tetramers.pickle", "wb"
) as handle:
pickle.dump(
normalized_tetramer_profiles, handle, protocol=pickle.HIGHEST_PROTOCOL
Expand Down Expand Up @@ -157,9 +157,7 @@ def get_cov_len(contigs_file, contig_names_rev, min_length, abundance_file):
coverages[contig_num].append(contig_coverage)

if len(coverages) == 0:
logger.error(
"Could not find any contigs longer than " + str(min_length) + "bp."
)
logger.error(f"Could not find any contigs longer than {min_length}bp.")
logger.info("Exiting MetaCoAG... Bye...!")
sys.exit(1)

Expand Down Expand Up @@ -206,9 +204,7 @@ def get_cov_len_megahit(
coverages[contig_num].append(contig_coverage)

if len(coverages) == 0:
logger.error(
"Could not find any contigs longer than " + str(min_length) + "bp."
)
logger.error(f"Could not find any contigs longer than {min_length}bp.")
logger.info("Exiting MetaCoAG... Bye...!")
sys.exit(1)

Expand Down
9 changes: 5 additions & 4 deletions metacoag_utils/graph_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python3

import re
import multiprocessing as mp
import re
from collections import defaultdict

from Bio import SeqIO
Expand Down Expand Up @@ -305,7 +305,6 @@ def get_links_megahit(assembly_graph_file):


def get_links_megahit_custom(assembly_graph_file):

my_map = BidirectionalMap()

node_count = 0
Expand Down Expand Up @@ -399,7 +398,9 @@ def get_connected_components(i, assembly_graph, binned_contigs):


def get_non_isolated(node_count, assembly_graph, binned_contigs, nthreads):
    """Run ``get_connected_components`` for every vertex index in parallel.

    Args:
        node_count: Number of vertex indices to process; indices
            ``0 .. node_count - 1`` are each submitted as one task.
        assembly_graph: Graph object forwarded unchanged to every
            ``get_connected_components`` call.
        binned_contigs: Collection forwarded unchanged to every
            ``get_connected_components`` call.
        nthreads: Number of worker processes given to ``mp.Pool``.

    Returns:
        The list returned by ``pool.starmap``: one entry per vertex index,
        in index order, each being whatever ``get_connected_components``
        returned for that index.
    """
    # One argument tuple per vertex index; built up front so the pool is
    # only held open for the actual dispatch.
    tasks = [(i, assembly_graph, binned_contigs) for i in range(node_count)]

    # Dispatch exactly once. The context manager tears the pool down on
    # exit, including when a worker raises.
    with mp.Pool(processes=nthreads) as pool:
        non_isolated = pool.starmap(get_connected_components, tasks)

    return non_isolated

0 comments on commit e7ca2a0

Please sign in to comment.