In [2]:
import pandas as pd
from Bio import SeqIO, AlignIO, Seq
import numpy as np
from collections import Counter
from helper import *
import python_cipres.client as CipresClient
from ete3 import Tree, TreeStyle

In [3]:
def check_avg_length(node,th=0.4):
    desc_length = np.mean([x.tot_dist for x in node.get_leaves()]) - node.tot_dist
    if desc_length < th:
        return True
    else:
        return False

def collapse_tree(node,th=0.4):
    global cluster_id
    cluster_list = pd.DataFrame([],columns=['leaf_id','cluster_id'])
    if check_avg_length(node,th):
        cluster_list = pd.DataFrame([[x.name, cluster_id] for x in node.get_leaves()],columns=['leaf_id','cluster_id'])
        cluster_id = cluster_id + 1
    else:
        for child in node.get_children():
            #print(child.name)
            desc_list = collapse_tree(child,th)
            cluster_list = pd.concat([cluster_list,desc_list])
    return cluster_list

In [107]:
type1_sub = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/rubisco_type1_subtypes.csv')
auto_90p = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9_no_outliers_with_type.csv')
auto_90p['ID_90'] = auto_90p.Target_90.apply(lambda x: x.split(' ')[0])
auto_90p_with_type = auto_90p.merge(type1_sub,left_on='ID_90',right_on='Label',how='outer')
auto_90p_with_type.loc[auto_90p_with_type.type =='I','type'] = auto_90p_with_type.loc[auto_90p_with_type.type =='I','subtype'] + '-' + auto_90p_with_type.loc[auto_90p_with_type.type =='I','domain']
auto_90p_with_type.loc[auto_90p_with_type.type.isna(),'type'] = 'unknown'
auto_90p_with_type.groupby(['type'])['Target_90'].nunique()

type
1-Prok      191
1a-Prok     102
1b-Euk      136
1b-Prok      20
1d-Euk       77
1e-Prok      32
II          109
II/III       29
III-like     60
IIIa         68
IIIc         29
unknown      48
Name: Target_90, dtype: int64

In [28]:
t = Tree('../output/02_90p_autotrophic_rubisco_tree/RaxML/RAxML_bipartitions.result',format=2)
#t.set_outgroup(t.get_leaves_by_name(auto_90p_with_type.loc[auto_90p_with_type['type'] == 'IV-outgroup','Target_y'].values[0])[0])
t.set_outgroup(t.get_leaves_by_name('gi|1490259551|gb|RKZ25299.1|')[0])

for leaf in t.traverse():
    leaf.add_feature('tot_dist',np.array([x.dist for x in leaf.get_ancestors()]).sum()+leaf.dist)

In [113]:
global cluster_id
cluster_id = 0
collapsed_nodes = collapse_tree(t,0.43)
collapsed_auto_90p = auto_90p_with_type.merge(collapsed_nodes,left_on='ID_90',right_on='leaf_id')
collapsed_auto_90p.groupby('type')['cluster_id'].nunique()
#collapsed_auto_90p.groupby('cluster_id')['type'].nunique().sort_values(ascending=False)
#collapsed_auto_90p[collapsed_auto_90p.cluster_id==7][['subtype','domain']].drop_duplicates()
collapsed_auto_90p.groupby('type')['cluster_id'].nunique()

type
1-Prok      28
1a-Prok      3
1b-Euk       1
1b-Prok      5
1d-Euk       1
1e-Prok      1
II          10
II/III       5
III-like    11
IIIa        33
IIIc        17
unknown     30
Name: cluster_id, dtype: int64

In [122]:
collapsed_auto_90p.groupby('cluster_id')['type'].nunique().max()
sum(collapsed_auto_90p['ID_90'].str.startswith('RBC'))

0

In [114]:
collapsed_nodes['synth'] = collapsed_nodes['leaf_id'].str.startswith('RBC')
prune_list = []
for x in collapsed_nodes.groupby('cluster_id'):
    x2 = x[1]
    x3 = x2[x2['synth']==True]
    if len(x3)>0:
        prune_list.append(x3.iloc[0].leaf_id)
        collapsed_nodes.loc[collapsed_nodes.cluster_id==x[0],'Centroid'] = x3.iloc[0].leaf_id
    else:
        prune_list.append(x2.iloc[0].leaf_id)
        collapsed_nodes.loc[collapsed_nodes.cluster_id==x[0],'Centroid'] = x2.iloc[0].leaf_id

In [115]:
t_pruned = t.copy()
t_pruned.prune(prune_list,preserve_branch_length=True)
t_pruned.describe()

Number of leaf nodes:	145
Total number of nodes:	289
Rooted:	Yes
Most distant node:	gi|1247451339|gb|PCJ09133.1|
Max. distance:	4.837283


In [116]:
!mkdir -p ../output/03_branch_length_cluster
t_pruned.write(features=[],outfile='../output/03_branch_length_cluster/pruned_tree0.43.nwk')

In [117]:
kinetic_data = pd.DataFrame([x.description for x in SeqIO.parse('../data/flamholz_et_al_2019_kinetically_characterized.faa', "fasta")],columns=['kinetic_ID'])
uclust_data = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.csv')
uclust_data = uclust_data[uclust_data['Type'] !='S']
uclust_data.loc[uclust_data['Target'] == '*','Target'] = uclust_data.loc[uclust_data['Target'] == '*','Query']
uclust_data['cut Target'] = uclust_data.Target.apply(lambda x: x.split(' ')[0])
uclust_data = uclust_data.merge(kinetic_data, left_on='Query', right_on='kinetic_ID',how='left')
uclust_data = uclust_data.merge(collapsed_nodes, left_on='cut Target', right_on='leaf_id',how='left')

synth_data = pd.DataFrame([x.description for x in SeqIO.parse('../data/milo_synthetized_rubisco.faa', "fasta")],columns=['syn_ID'])
uclust_data = uclust_data.merge(synth_data, left_on='Query', right_on='syn_ID',how='left')

uclust_data['kinetic_flag'] = '-1'
uclust_data['syn_flag'] = '-1'

kinetic_centroid = uclust_data.loc[~pd.isna(uclust_data['kinetic_ID']),'Centroid'].unique()
syn_centroid = uclust_data.loc[~pd.isna(uclust_data['syn_ID']),'Centroid'].unique()
uclust_data.loc[uclust_data['Centroid'].isin(kinetic_centroid),'kinetic_flag'] = '1'
uclust_data.loc[uclust_data['Centroid'].isin(syn_centroid),'syn_flag'] = '1'

lines = uclust_data['Centroid'].apply(lambda x: x.split(' ')[0]).values + ','+ uclust_data['kinetic_flag'].values+','+uclust_data['syn_flag'].values+'\n'

unique_lines = np.unique(lines)

AttributeError: 'float' object has no attribute 'split'

In [98]:
with open('../data/kinetic_sampling_legend.txt','r') as file:
    with open('../output/03_branch_length_cluster/kinetic_legend.txt', "w") as f1:
        for row in file:
            f1.write(row)
        for line in unique_lines:
            f1.write(line)
        file.close()
        f1.close()

In [20]:
type_data = pd.read_csv('../data/jaffe_et_al_2018_rubisco_types.csv')
uclust_data = uclust_data.merge(type_data, left_on='Query', right_on='ID',how='left')
type_centroids = uclust_data.loc[~pd.isna(uclust_data['type']),['Centroid','type']]
uclust_data = uclust_data.merge(type_centroids,left_on='Centroid',right_on='Centroid')

In [21]:
color_map = {'I': '#28B463',
             'II': '#E74C3C',
             'II/III':'#AF7AC5',
             'IIIa':'#AED6F1',
             'IIIb':'#3498DB',
             'IIIc':'#1F618D',
             'IIIlike':'#5D6D7E',
             'IV':'#F4D03F',
             'IVlike':'#F8C471',
             'unknown':'#F442D4'}
lines = uclust_data['Centroid'].apply(lambda x: x.split(' ')[0]).values +[',label,node,'+color_map[x]+',1,normal\n' for x in uclust_data['type_y'].values]
lines = set(lines)
with open('../data/itol_legend_template.txt','r') as file:
    with open('../output/03_branch_length_cluster/type_legend.txt', "w") as f1:
        for row in file:
            f1.write(row)
        for line in lines:
            f1.write(line)
        file.close()
        f1.close()

In [99]:
pruned_type = auto_rub90.loc[auto_rub90['cut Target90'].isin(prune_list),['Centroid','type']]
pruned_type.loc[pruned_type['type'].isin(['1b','1d']),'type'] = 'I-euk'
pruned_type.loc[(pruned_type['type']!='I-euk') & (pruned_type['type'].str.startswith('1')),'type'] = 'I-prok'
pruned_type.type.unique()

array(['II/III', 'II', 'I-prok', 'III-like', 'I-euk', 'IIIa', 'IIIc',
       'IV-outgroup'], dtype=object)

In [100]:


color_map = {'I-prok': '#bdff5b',
             'I-euk': '#28B463',
             'II': '#E74C3C',
             'II/III':'#AF7AC5',
             'IIIa':'#AED6F1',
             'IIIb':'#3498DB',
             'IIIc':'#1F618D',
             'III-like':'#5D6D7E',
             'IV-outgroup':'#F4D03F',
             'IVlike':'#F8C471',
             'unknown':'#F442D4'}
lines = pruned_type['Centroid'].apply(lambda x: x.split(' ')[0]).values +[',label,node,'+color_map[x]+',1,normal\n' for x in pruned_type['type'].values]
lines = set(lines)
with open('../data/itol_legend_template.txt','r') as file:
    with open('../output/03_branch_length_cluster/type_legend.txt', "w") as f1:
        for row in file:
            f1.write(row)
        for line in lines:
            f1.write(line)
        file.close()
        f1.close()