-
Notifications
You must be signed in to change notification settings - Fork 3
/
prune_tree.py
executable file
·105 lines (93 loc) · 2.94 KB
/
prune_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python
#
# prune_tree.py
#
# Author: Gregory Mendez
#
# This script will compare an alignment file (in Fasta or Phylip format) to a newick tree file,
# and create a new tree file containing only species also in the alignment file.
#
# This script takes 2 arguments:
# This first argument must be the newick formatted tree file.
# The second argument must be the fasta or phylip alignment.
#
# Example usage:
# prune_tree.py constraint.tre big_alignment.fasta
from Bio import Phylo
import sys, re
#Set tree to the first command line argument (0 is the script itself)
tree = Phylo.read(sys.argv[1], "newick")
INPUT = sys.argv[2]
SPECIES_LIST = []
TREE_LIST = []
GENE = sys.argv[2].split(".")[0]
OUTPUT = '%s.%s' % (GENE, sys.argv[1])
#Check if input alignment is a fasta or phylip file
with open(INPUT, 'r') as FILE_CHECK:
for LINE in FILE_CHECK:
if ">" in LINE:
FORMAT = str("FASTA")
break
else:
FORMAT = str("PHYLIP")
break
#FUNCTIONS
# Search tree by clade name
def lookup_by_names(tree):
names = {}
for clade in tree.find_clades():
if clade.name:
if clade.name in names:
raise ValueError("Duplicate key: %s" % clade.name)
names[clade.name] = clade
return names
# Name all unnamed clades
def tabulate_names(tree):
names = {}
for idx, clade in enumerate(tree.find_clades()):
if clade.name:
clade.name = clade.name
else:
clade.name = str(idx)
names[clade.name] = clade
return names
# Name all the clades in the tree
tabulate_names(tree)
# Fill Dictionary with clade names
names = lookup_by_names(tree)
# Make list of all species in tree.
for TERMINAL in names['0'].get_terminals():
TREE_LIST.append(TERMINAL.name)
#Make list of all species in alignment
SEQIDS = {}
with open(INPUT, 'r') as ALIGNMENT:
for LINE in ALIGNMENT:
if FORMAT is str("FASTA"):
if ">" in LINE:
SPECIES = re.sub(r'>([0-9A-Za-z_]+)___\n', r'\1', LINE)
SEQID = re.sub(r'>[0-9A-Za-z_]+___([0-9A-Za-z_|.]+)\n', r'\1', LINE)
if SPECIES in TREE_LIST:
SPECIES_LIST.append(SPECIES)
SEQIDS[SPECIES] = SEQID
else:
if FORMAT is str("PHYLIP"):
if re.match(r'\w', LINE):
SPECIES = LINE.split()[0].split("___")[0]
SEQID = LINE.split()[0].split("___")[1]
if SPECIES in TREE_LIST:
SPECIES_LIST.append(SPECIES)
SEQIDS[SPECIES] = SEQID
# Make list of species in tree that aren't in alignment
MISSING_LIST = []
for ITEM in TREE_LIST:
if ITEM not in SPECIES_LIST:
MISSING_LIST.append(ITEM)
# Remove missing species from tree and write this edited tree out to a new file
for PRUNE in MISSING_LIST:
tree.prune(PRUNE)
# Edit names in tree to include the sequence id from the alignment file
for idx, clade in enumerate(tree.find_clades()):
for KEY, VALUE in SEQIDS.iteritems():
if (clade.name == KEY):
clade.name = '___'.join([KEY, VALUE])
Phylo.write(tree, OUTPUT, 'newick')