In [30]:
import sys
import os
from numbers import Number
import tempfile
import pexpect
import fileinput
import urllib.request
from Bio import SeqIO
from Bio import AlignIO

In [2]:
def format_to_ten_characters(sequence_name):
    """
    Fit a sequence name into a label of exactly 10 characters.

    :param sequence_name: the sequence identifier to fit.
    :return: the trailing 10 characters of a longer name, or a shorter name
             right-padded with spaces up to 10 characters.
    """
    width = 10
    overflow = len(sequence_name) - width
    if overflow > 0:
        # Too long: drop the leading characters and keep the last `width` ones.
        return sequence_name[overflow:]
    # Short enough: -overflow is the number of blanks still missing (0 if exact).
    return sequence_name + ' ' * -overflow

In [3]:
## Download data

# Local file name -> URL the FASTA test file is fetched from.
testfile = {
    'an_ensembl.fa': 'http://www.csc.kth.se/utbildning/kth/kurser/DD2404/appbio12/labs/lab2/an_ensembl.fa',
    'small2.fa' :'http://www.csc.kth.se/utbildning/kth/kurser/DD2404/appbio12/labs/lab2/small2.fa',
    'small1.fa' :'http://www.csc.kth.se/utbildning/kth/kurser/DD2404/appbio12/labs/lab2/small1.fa'
}

# Fetch every test file and store it in the current working directory.
for filename, url in testfile.items():
    # Bind the response to its own name so the loop variable `url` is not
    # shadowed; the timeout keeps an unresponsive server from hanging the kernel.
    with urllib.request.urlopen(url, timeout=30) as response:
        gene = response.read().strip().decode("utf-8")
    # Write with the encoding the payload was decoded with, not the locale default.
    with open(filename, 'w', encoding='utf-8') as f_out:
        f_out.write(gene)

In [4]:
## Phylip exe path
# Directory the notebook changes into before launching ./seqboot, ./protdist
# and ./neighbor. Set the PHYLIP_EXE_PATH environment variable to point at a
# different installation; the original location remains the default.
path = os.environ.get(
    'PHYLIP_EXE_PATH',
    '/Users/user/Desktop/MSM/03-Applied_Bioinformatics - KTH/Labs/applied_bioinformatics_DD2404/05-advanced-assignments/01-neighbor-program/phylip-3.695/exe',
)


In [26]:
# NOTE(review): presumably the intended number of bootstrap replicates, but the
# PHYLIP cells in this notebook send a hard-coded "55" and never read this
# variable -- confirm which value is meant to be used.
bootstrap = 22
# FASTA file that is parsed into `records`.
infile = 'small2.fa'

In [17]:
## Parse sequences
# Materialise the parser's output as a list so the records can be indexed
# and renamed by the following cells.
records = [*SeqIO.parse(infile, "fasta")]

In [18]:
# Remember each record's full id by position, then swap in the 10-character label.
# Note: record.id is rewritten in place, so re-running this cell without
# re-parsing `records` would store the already shortened ids in longNames.
longNames = {index: entry.id for index, entry in enumerate(records)}
for entry in records:
    entry.id = format_to_ten_characters(entry.id)

In [19]:
# Write the renamed records in PHYLIP format to a named temporary file; its
# path (tmp_file.name) is what the seqboot cell sends as the input file name.
# The handle is deliberately left open here.
# NOTE(review): NamedTemporaryFile presumably deletes the file on close, so it
# has to stay open until the external program has read it -- confirm.
tmp_file = tempfile.NamedTemporaryFile(mode='w')
SeqIO.write(records,tmp_file,"phylip")
# NOTE(review): seek(0) is presumably relied on to flush the buffered text to
# disk before the file is read by name -- confirm. Its return value (0) is the
# cell's displayed output.
tmp_file.seek(0)

0

In [20]:
# Remember where the notebook was started so the final cell can return there,
# then move into the PHYLIP directory: the programs are launched as ./name and
# exchange data through files in the current working directory.
old_dir = os.getcwd()
os.chdir(path)

In [21]:
## bootstrap:
# Remove leftovers from earlier runs synchronously. Spawning `rm` without
# waiting for it raced with seqboot's start-up, which could still see a stale
# `infile` in the working directory.
for stale_name in ("infile", "outfile", "outtree"):
    try:
        os.remove(stale_name)
    except FileNotFoundError:
        pass  # nothing to clean up for this name

child = pexpect.spawn('./seqboot')
# Answers typed into seqboot's prompts, in order, starting with the path of the
# temporary PHYLIP file.
# NOTE(review): presumably "R"/"55" select 55 replicates, "Y" accepts the menu
# and "7" is the random seed -- confirm against the seqboot menu. The
# notebook's `bootstrap` variable is not used here.
sendList = [tmp_file.name,"R","55","Y","7"]
for redin in sendList:
    child.sendline(redin)
# Wait for seqboot to exit, then release its pty.
child.expect(pexpect.EOF)
child.close()
# Rename only after seqboot has finished. The previous spawned `mv` was closed
# immediately and could be terminated before the rename happened. A missing
# `outfile` now raises FileNotFoundError instead of failing silently.
os.replace("outfile", "infile")

In [22]:
## distance matrix: 
child = pexpect.spawn('./protdist')
# Answers typed into protdist's prompts, in order.
# NOTE(review): presumably "M"/"D"/"55" announce 55 data sets in `infile` and
# "Y" accepts the menu -- confirm against the protdist menu. The notebook's
# `bootstrap` variable is not used here.
sendList = ["M", "D", "55", "Y"]
for redin in sendList:
    child.sendline(redin)
# Wait for protdist to exit, then release its pty.
child.expect(pexpect.EOF)
child.close()
# Rename only after protdist has finished. The previous spawned `mv` was closed
# immediately and could be terminated before the rename happened. A missing
# `outfile` now raises FileNotFoundError instead of failing silently.
os.replace("outfile", "infile")

In [23]:
## phylotrees: 
child = pexpect.spawn('./neighbor')
# Answers typed into neighbor's prompts, in order.
# NOTE(review): presumably "M"/"55" announce 55 data sets, "7" is the random
# seed and "Y" accepts the menu -- confirm against the neighbor menu. The
# notebook's `bootstrap` variable is not used here.
sendList = ["M","55", "7", "Y"]
for redin in sendList:
    child.sendline(redin)
# Wait for neighbor to exit, then release its pty.
child.expect(pexpect.EOF)
child.close()
# Delete the intermediate files synchronously; `outtree` is kept for the next
# cell. The previous spawned `rm` was closed immediately and could be
# terminated before it removed anything.
for intermediate_name in ("outfile", "infile"):
    try:
        os.remove(intermediate_name)
    except FileNotFoundError:
        pass  # already gone

In [24]:
# Print the trees from `outtree`, replacing every shortened 10-character label
# with the full sequence id recorded in longNames.
# The context manager closes the file even if a replacement or print fails.
with open("outtree") as f:
    for line in f:
        for i in longNames:
            line = line.replace(records[i].id, longNames[i])
        # No trailing comma: in Python 3 it only wrapped print's None in a tuple.
        print(line.rstrip())

((ENSCAFP00000006409:0.00001,ENSCAFP00000032606:0.00000):0.14767,(ENSRNOP00000008865:0.13023,
(ENSPPYP00000012043:0.00144,ENSP00000366031:0.01601):0.13189):0.03772,ENSECAP00000000153:0.05010);
((ENSRNOP00000008865:0.14649,(ENSP00000366031:0.00753,ENSPPYP00000012043:0.00201):0.03382):0.03647,
(ENSCAFP00000006409:0.00001,ENSCAFP00000032606:0.00000):0.13274,ENSECAP00000000153:0.06500);
((ENSRNOP00000008865:0.18790,(ENSP00000366031:0.00873,ENSPPYP00000012043:0.00032):0.12461):0.06022,
(ENSCAFP00000032606:0.00001,ENSCAFP00000006409:0.00000):0.09551,ENSECAP00000000153:0.07658);
(ENSRNOP00000008865:0.08055,((ENSP00000366031:0.01332,ENSPPYP00000012043:0.00581):0.07628,
(ENSCAFP00000032606:0.00001,ENSCAFP00000006409:0.00000):0.09801):0.01845,ENSECAP00000000153:0.05275);
(((ENSPPYP00000012043:0.00001,ENSP00000366031:0.00000):0.15659,(ENSCAFP00000006409:0.00001,
ENSCAFP00000032606:0.00000):0.10410):0.02675,ENSRNOP00000008865:0.13399,ENSECAP00000000153:0.06829);
((ENSRNOP00000008865:0.10736,(ENSP0

In [25]:
# Return to the directory the notebook was in before entering the PHYLIP folder.
os.chdir(old_dir)