Skip to content
This repository has been archived by the owner on Sep 7, 2023. It is now read-only.

Commit

Permalink
Feature #5 resolve multifurcation (#58)
Browse files Browse the repository at this point in the history
* Add resolve multifurcation function using treerecs

* Add test for resolve multifuracation method
  • Loading branch information
moshi4 committed Oct 22, 2021
1 parent 52262a4 commit a5a367f
Show file tree
Hide file tree
Showing 8 changed files with 167 additions and 18 deletions.
4 changes: 2 additions & 2 deletions src/fastdtlmapper/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def get_args(argv: Optional[List[str]] = None) -> Args:
"--tree",
required=True,
type=Path,
help="Input rooted species newick tree file (timetree is preferable)",
help="Input rooted species newick tree file",
metavar="TREE",
)
parser.add_argument(
Expand Down Expand Up @@ -138,7 +138,7 @@ def get_args(argv: Optional[List[str]] = None) -> Args:
parser.add_argument(
"--inflation",
type=float,
help=f"MCL inflation parameter (Default: {default_inflation})",
help=f"OrthoFinder MCL inflation parameter (Default: {default_inflation})",
default=default_inflation,
metavar="",
)
Expand Down
Binary file added src/fastdtlmapper/bin/treerecs
Binary file not shown.
13 changes: 13 additions & 0 deletions src/fastdtlmapper/cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,19 @@ def get_iqtree_cmd(
+ f"--seed {self.params.rseed} {boot_opt} --redo --quiet 2>&1"
)

def get_treerecs_cmd(
self,
species_tree_file: Union[str, Path],
boot_tree_file: Union[str, Path],
outdir: Union[str, Path],
) -> str:
"""Get Treerecs run command"""
return (
f"treerecs -s {species_tree_file} -g {boot_tree_file} -o {outdir} "
+ f"--dupcost {self.params.dup_cost} --losscost {self.params.los_cost} "
+ "--output-without-description -f --quiet 2>&1"
)

def get_angst_cmd(
self,
species_tree_file: Union[str, Path],
Expand Down
9 changes: 5 additions & 4 deletions src/fastdtlmapper/out_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,11 @@ def __post_init__(self):

self.tmp_parallel_cmds_file = self.parallel_cmds_dir / "tmp_parallel_cmds.txt"

self.mafft_output_log_file = self.parallel_cmds_dir / "mafft_output_log.csv"
self.trimal_output_log_file = self.parallel_cmds_dir / "trimal_output_log.csv"
self.iqtree_output_log_file = self.parallel_cmds_dir / "iqtree_output_log.csv"
self.angst_output_log_file = self.parallel_cmds_dir / "angst_output_log.csv"
self.mafft_log_file = self.parallel_cmds_dir / "mafft_log.csv"
self.trimal_log_file = self.parallel_cmds_dir / "trimal_log.csv"
self.iqtree_log_file = self.parallel_cmds_dir / "iqtree_log.csv"
self.treerecs_log_file = self.parallel_cmds_dir / "treerecs_log.csv"
self.angst_log_file = self.parallel_cmds_dir / "angst_log.csv"

self._makedirs()

Expand Down
50 changes: 41 additions & 9 deletions src/fastdtlmapper/scripts/FastDTLmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def main(args: Optional[Args] = None):

# Check user input
InputCheck(args.indir, args.tree_file).run()
# # 00. Prepare analysis data
# 00. Prepare analysis data
format_user_tree(args.tree_file, outpath, cmd)
format_user_fasta(args.indir, outpath.user_fasta_dir)
# 01. Grouping ortholog sequences using OrthoFinder
Expand All @@ -41,9 +41,11 @@ def main(args: Optional[Args] = None):
trimal_run(outpath, cmd)
# 04. Reconstruct each OG gene tree using iqtree
iqtree_run(outpath, cmd)
# 05. DTL reconciliation using AnGST
# 05. Correct gene tree multifurcation using treerecs
treerecs_run(outpath, cmd)
# 06. DTL reconciliation using AnGST
angst_run(outpath, cmd)
# 06. Aggregate and map DTL reconciliation result
# 07. Aggregate and map DTL reconciliation result
group_id2all_node_event = aggregate_dtl_results(args, outpath)
output_aggregate_map_results(outpath, group_id2all_node_event)
output_aggregate_transfer_results(outpath)
Expand Down Expand Up @@ -71,9 +73,10 @@ def format_user_fasta(fasta_dir: Path, format_fasta_dir: Path) -> None:
fasta_outfile = format_fasta_dir / infile.with_suffix(".fa").name
species_name = infile.stem
if infile.suffix in (".fa", ".faa", ".fasta"):
# Add serial number annotation to fasta
UtilFasta(infile).add_serial_id(fasta_outfile, species_name)
elif infile.suffix in (".gb", ".gbk", ".genbank"):

# Convert genbank to fasta with serial number annotation
UtilGenbank(infile).convert_cds_fasta(
fasta_outfile, "protein", species_name
)
Expand Down Expand Up @@ -116,7 +119,7 @@ def mafft_run(outpath: OutPath, cmd: Cmd) -> None:
mafft_cmd_list.append(mafft_cmd)

cmd.run_parallel_cmd(
mafft_cmd_list, outpath.tmp_parallel_cmds_file, outpath.mafft_output_log_file
mafft_cmd_list, outpath.tmp_parallel_cmds_file, outpath.mafft_log_file
)


Expand All @@ -132,7 +135,7 @@ def trimal_run(outpath: OutPath, cmd: Cmd) -> None:
shutil.copy(aln_file, aln_trim_file)

cmd.run_parallel_cmd(
trimal_cmd_list, outpath.tmp_parallel_cmds_file, outpath.trimal_output_log_file
trimal_cmd_list, outpath.tmp_parallel_cmds_file, outpath.trimal_log_file
)

# If trimal removed sequence, use raw mafft alignment in next step
Expand Down Expand Up @@ -170,22 +173,51 @@ def iqtree_run(outpath: OutPath, cmd: Cmd) -> None:
UtilTree.make_3genes_tree(aln_trim_file, gene_tree_file)

cmd.run_parallel_cmd(
iqtree_cmd_list, outpath.tmp_parallel_cmds_file, outpath.iqtree_output_log_file
iqtree_cmd_list, outpath.tmp_parallel_cmds_file, outpath.iqtree_log_file
)


def treerecs_run(outpath: OutPath, cmd: Cmd) -> None:
"""Run treerecs"""
print("\n# 05. Correct gene tree multifurcation using treerecs")
treerecs_cmd_list, treerecs_outfile_list = [], []
for boot_tree_file in sorted(outpath.dtl_rec_dir.glob("**/iqtree/*.ufboot")):
outdir = boot_tree_file.parent.parent / "treerecs"
outdir.mkdir(exist_ok=True)
# Multifurcate IQ-TREE boot trees
group_id = boot_tree_file.parent.parent.name
multifurcate_boot_tree_file = outdir / (group_id + "_multifurcate.ufboot")
UtilTree.multifurcate_zero_length_nodes(
boot_tree_file, multifurcate_boot_tree_file
)
treerecs_cmd = cmd.get_treerecs_cmd(
outpath.um_tree_file, multifurcate_boot_tree_file, outdir
)
treerecs_cmd_list.append(treerecs_cmd)

treerecs_outfile = outdir / (multifurcate_boot_tree_file.name + "_recs.nwk")
treerecs_outfile_list.append(treerecs_outfile)

cmd.run_parallel_cmd(
treerecs_cmd_list, outpath.tmp_parallel_cmds_file, outpath.treerecs_log_file
)
# Unroot treerecs rooted tree (AnGST only accepts unrooted tree)
for treerecs_outfile in treerecs_outfile_list:
UtilTree.unroot_tree(treerecs_outfile, treerecs_outfile)


def angst_run(outpath: OutPath, cmd: Cmd) -> None:
"""Run AnGST"""
print("\n# 05. DTL reconciliation using AnGST")
angst_cmd_list = []
for boot_tree_file in sorted(outpath.dtl_rec_dir.glob("**/*.ufboot")):
for boot_tree_file in sorted(outpath.dtl_rec_dir.glob("**/*_recs.nwk")):
outdir = boot_tree_file.parent.parent / "angst"
outdir.mkdir(exist_ok=True)
angst_cmd = cmd.get_angst_cmd(outpath.um_tree_file, boot_tree_file, outdir)
angst_cmd_list.append(angst_cmd)

cmd.run_parallel_cmd(
angst_cmd_list, outpath.tmp_parallel_cmds_file, outpath.angst_output_log_file
angst_cmd_list, outpath.tmp_parallel_cmds_file, outpath.angst_log_file
)


Expand Down
4 changes: 2 additions & 2 deletions src/fastdtlmapper/setup_binpath.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ class SetupBinpath:
"mafft",
"trimal",
"iqtree",
"treerecs",
"AnGST.py",
"AnGST_wrapper.py",
"parallel",
]
)

def __post_init__(self):
bin_path_list = self._get_bin_path_list()
self._add_bin_path(bin_path_list)
self._add_bin_path(self._get_bin_path_list())
self._bin_exists_check(self.bin_list)

def _get_bin_path_list(self) -> List[Path]:
Expand Down
75 changes: 74 additions & 1 deletion src/fastdtlmapper/util/tree.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Union
from typing import Dict, List, Union

from Bio import Phylo, SeqIO
from Bio.Phylo.BaseTree import Tree
from ete3 import Tree as EteTree
from fastdtlmapper.angst.model import NodeEvent


Expand Down Expand Up @@ -107,3 +108,75 @@ def map_node_event(
replace_tree_info = f.read().replace(":0.000000;", ";").replace("'", "")
with open(nwk_tree_outfile, "w") as f:
f.write(replace_tree_info)

@staticmethod
def multifurcate_zero_length_nodes(
gene_trees_infile: Union[str, Path],
multifurcate_outfile: Union[str, Path],
max_tree_num: int = 100,
) -> None:
"""Multifurcate zero branch length nodes (For IQ-TREE randomly bifurcated tree)
Args:
gene_trees_infile (Union[str, Path]): IQ-TREE gene trees file
multifurcate_outfile (Union[str, Path]): Multifurcate gene trees file
max_tree_num (int, optional): Number of max tree output
Note:
IQ-TREE randomly bifurcate identical sequences with zero branch length.
Random bifurcated topology make the DTL reconciliation result worse.
In order to resolve randomly bifurcated tree using 'treerecs' software,
multifurcate zero branch length IQ-TREE gene trees to use as treerecs input.
"""
# Load gene trees
gene_tree_list: List[EteTree] = []
with open(gene_trees_infile) as f:
tree_text_lines = f.read().splitlines()
for tree_text in tree_text_lines[0:max_tree_num]:
gene_tree_list.append(EteTree(tree_text))

# Unroot all zero branch length nodes in gene trees
gene_tree_text_list = []
for gene_tree in gene_tree_list:
gene_tree.set_outgroup(gene_tree.get_midpoint_outgroup())
for node in gene_tree.traverse(strategy="postorder"):
descendants_node_list = node.get_descendants()
if len(descendants_node_list) <= 2:
continue
descendants_total_dist = sum([n.dist for n in descendants_node_list])
if descendants_total_dist == 0:
node.unroot()
# gene_tree.unroot()
gene_tree_text_list.append(gene_tree.write(dist_formatter="%1.10f"))

# Output unrooted gene trees
with open(multifurcate_outfile, "w") as f:
f.write("\n".join(gene_tree_text_list))

@staticmethod
def unroot_tree(
gene_trees_infile: Union[str, Path],
unroot_gene_trees_outfile: Union[str, Path],
) -> None:
"""Unroot tree
Args:
gene_trees_infile (Union[str, Path]): IQ-TREE gene trees file
unroot_gene_trees_outfile (Union[str, Path]): Output unroot gene trees file
"""
# Load gene trees
gene_tree_list: List[EteTree] = []
with open(gene_trees_infile) as f:
tree_text_lines = f.read().splitlines()
for tree_text in tree_text_lines:
gene_tree_list.append(EteTree(tree_text))

# Unroot gene trees
gene_tree_text_list = []
for gene_tree in gene_tree_list:
gene_tree.unroot()
gene_tree_text_list.append(gene_tree.write(dist_formatter="%1.10f"))

# Output unrooted gene trees
with open(unroot_gene_trees_outfile, "w") as f:
f.write("\n".join(gene_tree_text_list))
30 changes: 30 additions & 0 deletions tests/util/test_tree.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pathlib import Path

from ete3 import Tree
from fastdtlmapper.util import UtilTree


Expand Down Expand Up @@ -41,3 +42,32 @@ def test_make_3gene_tree(tmp_path: Path):
UtilTree.make_3genes_tree(fasta_3gene_file, gene_tree_outfile)
gene_tree_text = gene_tree_outfile.read_text()
assert gene_tree_text == "(insp1:0.01,insp2:0.01,outsp1:0.01);"


def test_multifurcate_zero_length_nodes(tmp_path: Path):
"""test multifurcate_zero_length_nodes"""
root_tree_file = tmp_path / "root_tree.nwk"
root_tree_text = "(((a:0,b:0):0,c:0):1,(d:1,e:1):1);"
root_tree_file.write_text(root_tree_text)
root_tree = Tree(root_tree_text)

multifurcate_outfile = tmp_path / "multifuracate_tree.nwk"
UtilTree.multifurcate_zero_length_nodes(root_tree_file, multifurcate_outfile)
multifurcate_tree = Tree(multifurcate_outfile.read_text())

assert len(root_tree.get_common_ancestor("a", "b", "c").children) == 2
assert len(multifurcate_tree.get_common_ancestor("a", "b", "c").children) == 3


def test_unroot_tree(tmp_path: Path):
"""test unroot_tree"""
root_tree_file = tmp_path / "root_tree.nwk"
root_tree_text = "(((a,b),c),(d,e));"
root_tree_file.write_text(root_tree_text)

unroot_tree_file = tmp_path / "unroot_tree.nwk"
UtilTree.unroot_tree(root_tree_file, unroot_tree_file)
unroot_tree = Tree(unroot_tree_file.read_text())

assert len(Tree(root_tree_text).get_tree_root().children) == 2
assert len(unroot_tree.children) == 3

0 comments on commit a5a367f

Please sign in to comment.