-
Notifications
You must be signed in to change notification settings - Fork 0
/
fasta_to_nexus.py
49 lines (44 loc) · 1.36 KB
/
fasta_to_nexus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/python
'''
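usage: python fasta_to_nexus.py
Run the script in the directory containing the FASTA files to convert. Every
file matching *_2a.fasta is converted to a text file named <prefix>_2a.nex in
which each FASTA header (including its leading '>') is separated from its
corresponding sequence by a tab, one record per line.
E.g.,
>Canis_familiaris\tACTG...\n
>Felis_catus\tACTG...\n
>Mus_musculus\tACTG...\n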
Lucy Tran
Department of Ecology and Evolutionary Biology
University of Michigan, Ann Arbor
May 3, 2010
'''
import glob
import re
import io

# Input files are selected by FASTA_SUFFIX; the output file keeps the same
# prefix and swaps that suffix for NEXUS_SUFFIX.
FASTA_SUFFIX = '_2a.fasta'
NEXUS_SUFFIX = '_2a.nex'

# A single pattern captures a header line (leading '>' included) together with
# the sequence block that follows it, so a header can never be paired with a
# different record's sequence.  The sequence block is a run of at least 10
# uppercase letters, gap characters ('-') and newlines, with no upper length
# limit; it ends at the first character outside that set.
RECORD_RE = re.compile(r'(>.*)\n([A-Z\n\-]{10,})')


def fasta_to_tab_lines(text):
    '''Return one "header<TAB>sequence<NEWLINE>" string per FASTA record.

    text -- the full contents of a FASTA file as a single string.

    Headers keep their leading '>'.  Line breaks inside a sequence are
    removed so that each record occupies exactly one output line.  A header
    that is not followed by a sequence block matching RECORD_RE (for example
    one shorter than 10 characters) is skipped.
    '''
    return [header + '\t' + sequence.replace('\n', '') + '\n'
            for header, sequence in RECORD_RE.findall(text)]


def convert_file(infile):
    '''Convert one FASTA file to a tab-delimited file and return its path.

    infile -- path of the FASTA file; it is expected to end in FASTA_SUFFIX,
              and its last len(FASTA_SUFFIX) characters are replaced by
              NEXUS_SUFFIX to form the output path.

    Raises IOError/OSError if either file cannot be opened, and
    UnicodeDecodeError if the input is not valid UTF-8.
    '''
    outfilename = infile[:-len(FASTA_SUFFIX)] + NEXUS_SUFFIX
    # io.open gives text mode with an explicit encoding on both Python 2 and
    # Python 3.  The input is read completely before the output is opened, so
    # a failed read does not leave an empty output file behind, and the
    # context managers close both files even when an error occurs.
    with io.open(infile, 'r', encoding='utf-8') as fasta:
        text = fasta.read()
    with io.open(outfilename, 'w', encoding='utf-8') as outfile:
        # writelines() copes with an empty record list on Python 2, where
        # joining an empty list would yield a byte string that io rejects.
        outfile.writelines(fasta_to_tab_lines(text))
    return outfilename


# Convert every matching file in the current working directory.  This runs at
# module level so that `python fasta_to_nexus.py` needs no arguments.
for fasta_path in glob.iglob('*' + FASTA_SUFFIX):
    convert_file(fasta_path)
# Parenthesised single-argument form prints identically on Python 2 and 3.
print('Fasta file writing complete!')