In [98]:
import pandas as pd
import os
from itertools import product
import numpy as np
import ete3
import re

# Handle on ete3's NCBI taxonomy interface; a later cell uses it to look up
# lineages, ranks and taxon names for the tree leaves.
ncbi     = ete3.NCBITaxa()
# Switch the working directory; the next cell lists '.' and expects to find
# files named '<genome>-<protein>' there.
# NOTE(review): hard-coded absolute path -- only valid on the original machine.
%cd /work/bisson/searches

/work/bisson/searches


In [4]:
# Gather the distinct genome and protein labels encoded in the names of the
# files in the current directory. Every non-hidden file name is expected to be
# of the form '<genome>-<protein>' (exactly one dash).
genomes, proteins = set(), set()
for filename in os.listdir('.'):
    if not filename.startswith('.'):
        # hidden entries (dot-files) are ignored; everything else is unpacked
        genome, protein = filename.split('-')
        genomes.add(genome)
        proteins.add(protein)

In [46]:
# Parse every '<genome>-<protein>' search result in the current directory into
# one table with a row per reported hit.
#
# Each non-comment line is split on whitespace: field 0 is kept as the target
# name, fields 4-6 as e-value / score / bias, and everything from field 18 on
# is re-joined into the free-text description.
# NOTE(review): this field layout looks like HMMER's --tblout format; confirm
# against how the searches were actually run.
hit_columns = 'genome protein target_name e-value score bias description'.split()
tmp_data    = []
for genome, protein in product(genomes, proteins):
    filename = '%s-%s' % (genome, protein)
    # The context manager closes each result file as soon as it has been read
    # (the handles were previously left open).
    with open(filename) as handle:
        for line in handle:
            if line.startswith('#'):
                continue
            line = line.split()
            if not line:
                # blank line: nothing to parse (would otherwise fail on line[0])
                continue
            tmp_data.append([genome, protein, line[0]] +
                            line[4:7] +
                            [' '.join(line[18:])])

# Build the frame once from all collected rows instead of appending one small
# frame per file: appending in a loop copies the whole table every iteration,
# and DataFrame.append no longer exists in pandas >= 2.0.
df = pd.DataFrame(data=tmp_data, columns=hit_columns)
df['e-value'] = df['e-value'].astype(np.float64)
df['score']   = df['score'].astype(np.float64)
df['bias']    = df['bias'].astype(np.float64)

In [68]:
# Reduce the hit table to at most one target per protein and one protein per
# target within each genome:
#   1. for every query protein keep only its lowest e-value hit;
#   2. if several proteins picked the same target, keep only the assignment
#      with the lowest e-value for that target.
per_genome_hits = []
for genome in genomes:
    tmp_df  = df[df.genome == genome]

    # step 1: best hit of each protein
    indexes = []
    for protein in tmp_df.protein.unique():
        indexes.append(tmp_df[tmp_df.protein==protein]['e-value'].idxmin())
    tmp_df  = tmp_df.reindex(index=indexes)

    # step 2: best protein for each target that survived step 1
    indexes = []
    for target in tmp_df.target_name.unique():
        indexes.append(tmp_df[tmp_df.target_name==target]['e-value'].idxmin())
    per_genome_hits.append(tmp_df.reindex(index=indexes))

# Concatenate once at the end: growing the frame inside the loop is quadratic
# and relies on DataFrame.append, which was removed in pandas 2.0.
if per_genome_hits:
    assigned_proteins = pd.concat(per_genome_hits, ignore_index=True)
else:
    # no genomes at all: keep the same empty, correctly-labelled frame as before
    assigned_proteins = pd.DataFrame(columns=df.columns)

In [88]:
# Preview the first rows of the per-genome best-hit assignments.
assigned_proteins.head()

Unnamed: 0,genome,protein,target_name,e-value,score,bias,description
0,GCA_003662775.1,FtsZ1,RLI57951.1,2.8e-138,458.9,5.4,cell division protein FtsZ [Candidatus Thorarc...
1,GCA_003662775.1,FtsZ2,RLI53936.1,1.9e-154,512.2,8.4,cell division protein FtsZ [Candidatus Thorarc...
2,GCA_001940705.1,FtsZ1,OLS26346.1,6.5e-138,457.1,7.3,Cell division protein FtsZ 1 [Candidatus Thora...
3,GCA_001940705.1,FtsZ2,OLS23328.1,1.6e-154,511.9,6.5,Cell division protein FtsZ 1 [Candidatus Thora...
4,GCA_001940645.1,FtsZ1,OLS21975.1,3e-132,439.4,9.6,Cell division protein FtsZ 1 [Candidatus Heimd...


In [129]:
# Spot checks: every assignment kept for one genome, and the row assigned to
# one specific target protein.
# print() with a single parenthesised argument produces the same output on
# Python 2 and Python 3, whereas the bare print statement is a SyntaxError on 3.
print(assigned_proteins.loc[(assigned_proteins.genome=='GCA_001940725.1')])
print(assigned_proteins.loc[(assigned_proteins.target_name=='OLS17704.1')])


             genome protein target_name       e-value  score  bias  \
35  GCA_001940725.1   CetZ2  OLS29073.1  3.300000e-14   50.6   0.1   

                                          description  
35  Tubulin-like protein CetZ [Candidatus Heimdall...  
             genome protein target_name        e-value  score  bias  \
12  GCA_001940665.1   FtsZ2  OLS17704.1  2.300000e-154  510.5  10.0   

                                          description  
12  Cell division protein FtsZ 1 [Candidatus Odina...  


In [86]:
%cd /work/bisson/
gene_names = {'ENOG410KRFR': 'FtsZ2',
              'ENOG410KRFT': 'FtsZ1',
              'ENOG410KS48': 'CetZ2',
              'ENOG410KRU8': 'CetZ1'}
out = open('extended_groups/FtsZ_homologues.faa', 'a')
for genome in assigned_proteins.genome.unique():
    faa = open('asgard_genomes/%s.faa' % genome).read().split('>')
    faa.pop(0)
    homologues = assigned_proteins.target_name[assigned_proteins.genome == genome].tolist()
    for block in faa:
        header = block.split()[0]
        if header in homologues:
            sequence = ''.join(block.split('\n')[1:])
            out.write('>%s|%s\n%s\n'  %(genome, header, sequence))
out.close()

/work/bisson


In [131]:
# Load the tree and build a lookup from eggNOG sequence id to gene label.
# NOTE(review): the paths below start with '../', unlike the FASTA export cell
# which writes to 'extended_groups/...' -- this cell was evidently run from a
# different working directory; verify the cwd before re-running.
with open('../extended_groups/FtsZ_homologues.aln.treefile') as handle:
    tree_txt = handle.read()

# Leaf labels in the tree file use '<genome>_<target>'; put back the '|'
# separator that the FASTA headers were written with (presumably the tree
# builder replaced it -- confirm).
for index, row in assigned_proteins.iterrows():
    tree_txt = tree_txt.replace('%s_%s' % (row.genome, row.target_name), '%s|%s' % (row.genome, row.target_name))

tree     = ete3.Tree(tree_txt, format=1)

# id_table: first token of every FASTA header in each eggNOG group file,
# mapped to that group's gene label.
id_table = {}
for fastaname, gene_name in gene_names.items():
    with open('../eggnog4/candidate_ftsz/eurNOG.%s.meta_raw.fa' % fastaname) as handle:
        # raw string: same pattern, without relying on an invalid '\S' escape
        ids = re.findall(r'^>(\S+)', handle.read(), re.M)
    id_table.update(dict.fromkeys(ids, gene_name))

In [132]:
# Write the tree as a NEXUS file with FigTree-style '[&key=value]' annotations:
#   * a taxa block listing every leaf with its gene type and taxonomy,
#   * a trees block holding the newick string with support values attached to
#     the internal nodes.
#
# The file is opened in text mode ('w'): everything written is str, which a
# binary-mode handle rejects on Python 3. The context manager guarantees the
# file is closed even if a lookup below fails.
with open('../extended_groups/FtsZ_homologues.figTree', 'w') as out:
    out.write("#NEXUS\nbegin taxa;\n\tdimensions ntax=%i;\n\ttaxlabels\n" %len(tree))
    branch_names = {}  # NOTE(review): not used anywhere in this cell
    for node in tree.traverse():
        if node.is_leaf():
            if node.name.startswith('GCA_'):
                # Leaves named '<genome>|<target>' come from the assembled
                # genomes; all of them get the same taxid.
                # NOTE(review): 1935183 is presumably the NCBI taxid of the
                # group these genomes belong to -- confirm.
                taxid = 1935183
                genome, gene = node.name.split('|')
                print('%s %s' % (genome, gene))
                gene_type = assigned_proteins.loc[(assigned_proteins.genome==genome) &
                                                  (assigned_proteins.target_name==gene),
                                                  'protein'].squeeze()
            else:
                # Remaining leaves are named '<taxid>.<gene id>'; split only on
                # the first dot so gene ids that contain dots do not break the
                # unpacking.
                taxid, gene = node.name.split('.', 1)
                gene_type = id_table[node.name]

            # rank name -> taxid for every rank present in the leaf's lineage
            lineage = {j: i for i, j in ncbi.get_rank(ncbi.get_lineage(int(taxid))).items()}
            lineage_names = ncbi.get_taxid_translator(lineage.values())

            out.write('\t%s ' % (node.name))
            comment = ['gene_type="%s"' % gene_type]
            for rank in ['class', 'phylum', 'order', 'family', 'species']:
                if rank in lineage:
                    comment.append('tax_%s="%s"' % (rank, lineage_names[lineage[rank]]))
            out.write('[&%s]\n' %' '.join(comment))

        elif node.name and not node.name.startswith('[&'):
            # Internal node labels hold two support values as '<aLRT>/<UFBoot>'.
            # Labels already rewritten by a previous run of this cell start
            # with '[&' and are left alone, so the cell can be re-executed.
            aLRT, UFBoot = node.name.split('/')
            node.name = '[&UFBoot=%.2f,aLRT=%.2f]' %(float(UFBoot), float(aLRT))

    newick_text = tree.write(format=1)
    # In the written newick the annotation's brackets, '=' and ',' come back as
    # underscores; restore the original '[&UFBoot=..,aLRT=..]' form.
    newick_text = re.sub(r'_&UFBoot_(\d+\.\d\d)_aLRT_(\d+\.\d\d)_', r'[&UFBoot=\1,aLRT=\2]', newick_text)
    out.write(';\nend;\n')
    out.write('begin trees;\n\ttree tree_1 = [&R] %s\nend;' %newick_text)

GCA_001940665.1 OLS17704.1
GCA_003662865.1 RLI65521.1
GCA_000986845.1 KKK42581.1
GCA_003662935.1 RLI69574.1
GCA_003662885.1 RLI68895.1
GCA_003662835.1 RLI63391.1
GCA_003662835.1 RLI64272.1
GCA_001940725.1 OLS29073.1
GCA_003662815.1 RLI57875.1
GCA_001940705.1 OLS23328.1
GCA_003345555.1 RDE15852.1
GCA_003345595.1 RDE17459.1
GCA_003662805.1 RLI61079.1
GCA_003662765.1 RLI51259.1
GCA_001940665.1 OLS17546.1
GCA_003662775.1 RLI53936.1
GCA_003345545.1 RDE10823.1
GCA_001940645.1 OLS21975.1
GCA_003144275.1 PWI49717.1
GCA_002728275.1 MBS85472.1
GCA_001940755.1 OLS31474.1
GCA_001940655.1 OLS14163.1
GCA_001563325.1 KXH77811.1
GCA_003662935.1 RLI70985.1
GCA_003345555.1 RDE16533.1
GCA_003345595.1 RDE12356.1
GCA_003662805.1 RLI61572.1
GCA_003662765.1 RLI50938.1
GCA_003662875.1 RLI68069.1
GCA_003662885.1 RLI73395.1
GCA_003345545.1 RDE15114.1
GCA_001940705.1 OLS26346.1
GCA_003662775.1 RLI57951.1
GCA_003662815.1 RLI58918.1
GCA_001563465.1 KXH75745.1
GCA_001563335.1 KXH74114.1


In [101]:
# Debug inspection: `node` is whatever the traversal loop in the export cell
# left bound last, so this only works after that cell has been run.
node.name

'[&UFBoot=44.00,aLRT=55.10]'