In [2]:
import os
from collections import Counter

import numpy as np
import pandas as pd
from Bio import SeqIO, AlignIO, Seq

# The wildcard import stays ahead of the two imports below so that any name it
# exports is still overridden by them, exactly as before.
from helper import *
import python_cipres.client as CipresClient
from ete3 import Tree, TreeStyle

In [3]:
def check_avg_length(node, th=0.4):
    """Tell whether the leaves under `node` lie, on average, closer than `th` to it.

    The mean of `tot_dist` over `node.get_leaves()` is computed and
    `node.tot_dist` is subtracted from it; the difference is compared with `th`.

    Args:
        node: object exposing `tot_dist` and `get_leaves()`; every leaf it
            returns must expose `tot_dist` as well.
        th: strict upper bound for the mean distance below `node`.

    Returns:
        True if the mean distance is strictly smaller than `th`, else False.
    """
    leaf_depths = [leaf.tot_dist for leaf in node.get_leaves()]
    mean_below_node = np.mean(leaf_depths) - node.tot_dist
    return bool(mean_below_node < th)

def collapse_tree(node, th=0.4):
    """Recursively split the subtree under `node` into clusters of leaves.

    If `check_avg_length(node, th)` holds, all leaves under `node` form one
    cluster labelled with the current value of the module-level counter
    `cluster_id`, which is then incremented. Otherwise each child is processed
    in turn and the per-child tables are concatenated.

    Args:
        node: tree node exposing `get_leaves()` and `get_children()`, plus
            whatever `check_avg_length` reads from it.
        th: threshold forwarded to `check_avg_length`.

    Returns:
        DataFrame with columns 'leaf_id' and 'cluster_id'. Row indices are not
        reset after concatenation, so index labels can repeat.

    Note:
        The global `cluster_id` must be defined before the first call.
    """
    global cluster_id
    table_columns = ['leaf_id', 'cluster_id']

    if check_avg_length(node, th):
        # The whole subtree is tight enough: emit it as one cluster.
        members = pd.DataFrame(
            [[leaf.name, cluster_id] for leaf in node.get_leaves()],
            columns=table_columns,
        )
        cluster_id += 1
        return members

    # Too spread out: descend and stitch the children's results together.
    merged = pd.DataFrame([], columns=table_columns)
    for child in node.get_children():
        merged = pd.concat([merged, collapse_tree(child, th)])
    return merged

In [10]:
# Read the tree file and the table that carries a 'type' label per 'Target_y' entry.
t = Tree('../output/02_90p_autotrophic_rubisco_tree/RaxML/RAxML_bestTree.result')
auto_rub90 = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/uclust_all_0.9_with_type.csv')

# Root the tree on the first table entry whose type is 'IV-outgroup'.
is_outgroup = auto_rub90['type'] == 'IV-outgroup'
outgroup_name = auto_rub90.loc[is_outgroup, 'Target_y'].values[0]
outgroup_leaf = t.get_leaves_by_name(outgroup_name)[0]
t.set_outgroup(outgroup_leaf)

# Store on every traversed node the sum of its own branch length and those of
# all its ancestors; collapse_tree()/check_avg_length() read this as `tot_dist`.
for node in t.traverse():
    ancestor_lengths = np.array([x.dist for x in node.get_ancestors()])
    node.add_feature('tot_dist', ancestor_lengths.sum() + node.dist)

In [11]:
# Reset the counter that collapse_tree() reads and increments, so that cluster
# ids start from 0 each time this cell is run.
cluster_id = 0
collapsed_nodes = collapse_tree(t, th=0.4)

In [12]:
# Spot check: look up one leaf by name and list the leaves below its first ancestor.
g = t.get_leaves_by_name('gi|1133712163|ref|WP_076097556.1|')[0]
# NOTE(review): get_ancestors()[0] is presumably the direct parent (ete3 ordering) — confirm.
# As the cell's last expression, the resulting leaf list is what gets displayed.
g.get_ancestors()[0].get_leaves()

[Tree node 'gi|1133712163|ref|WP_076097556.1|' (-0x7ffff8036504ed54),
 Tree node 'RBC4_62' (0x7fc9afab81d),
 Tree node 'RBC_81' (0x7fc9afb12b0)]

In [13]:
# Reload the type table, derive the leaf label (text before the first space of
# 'Target_y') and attach the cluster assignment of each label. The default
# (inner) merge keeps only labels that appear in collapsed_nodes.
auto_rub90 = (
    pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/uclust_all_0.9_with_type.csv')
    .assign(**{'cut Target': lambda frame: frame['Target_y'].apply(lambda label: label.split(' ')[0])})
    .merge(collapsed_nodes, left_on='cut Target', right_on='leaf_id')
)

# Optional export, left disabled as in the original cell:
#auto_rub90.to_csv('../output/03_branch_length_cluster/uclust_0.9_with_bl_0.4.csv')

# Displayed result: number of distinct clusters for every type.
auto_rub90.groupby('type')['cluster_id'].nunique()

type
I              27
II             15
II/III          6
III-like       12
IIIa           23
IIIc           17
IV-outgroup     1
Name: cluster_id, dtype: int64

In [14]:
# Mark leaves whose id starts with 'RBC' (presumably the synthesized
# sequences, going by the column name — confirm).
collapsed_nodes['synth'] = collapsed_nodes['leaf_id'].str.startswith('RBC')

# For each cluster keep one representative leaf: the first flagged member if
# the cluster has any, otherwise simply its first member. The choice is stored
# both in prune_list and in the 'Centroid' column of every row of the cluster.
prune_list = []
for cluster, members in collapsed_nodes.groupby('cluster_id'):
    flagged = members[members['synth'].eq(True)]
    candidates = flagged if len(flagged) > 0 else members
    representative = candidates.iloc[0].leaf_id
    prune_list.append(representative)
    collapsed_nodes.loc[collapsed_nodes.cluster_id == cluster, 'Centroid'] = representative

In [15]:
# Prune a copy, so the full tree `t` is not modified by this cell.
t_pruned = t.copy()
# Keep only the leaves listed in prune_list (one representative per cluster);
# preserve_branch_length=True asks ete3 to keep branch lengths when nodes are removed.
t_pruned.prune(prune_list,preserve_branch_length=True)
# Print a short summary of the pruned tree.
t_pruned.describe()

Number of leaf nodes:	101
Total number of nodes:	201
Rooted:	Yes
Most distant node:	RBC_41
Max. distance:	6.208213


In [16]:
# Create the output directory from Python rather than with the IPython-only,
# POSIX-only `!mkdir -p`, so the cell also runs outside IPython and on any OS.
# exist_ok=True keeps the "no error if it already exists" behaviour of -p.
os.makedirs('../output/03_branch_length_cluster', exist_ok=True)
# Write the pruned tree in Newick format without extra node features.
t_pruned.write(features=[],outfile='../output/03_branch_length_cluster/pruned_tree0.4.nwk')

In [18]:
# Record descriptions of the sequences in the kinetically-characterized FASTA file.
kinetic_data = pd.DataFrame(
    [x.description for x in SeqIO.parse('../data/flamholz_et_al_2019_kinetically_characterized.faa', "fasta")],
    columns=['kinetic_ID'])

uclust_data = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.csv')
# Drop rows of Type 'S' (presumably the UCLUST seed records — confirm).
# The explicit copy makes the frame independent of the one it was filtered
# from, so the .loc assignment below does not trigger SettingWithCopyWarning.
uclust_data = uclust_data[uclust_data['Type'] != 'S'].copy()

# Rows whose Target is '*' get their own Query as Target.
star_target = uclust_data['Target'] == '*'
uclust_data.loc[star_target, 'Target'] = uclust_data.loc[star_target, 'Query']

# Leaf label = text before the first space of Target; used to join with the clusters.
uclust_data['cut Target'] = uclust_data.Target.apply(lambda x: x.split(' ')[0])
uclust_data = uclust_data.merge(kinetic_data, left_on='Query', right_on='kinetic_ID',how='left')
uclust_data = uclust_data.merge(collapsed_nodes, left_on='cut Target', right_on='leaf_id',how='left')

# Record descriptions of the sequences in the synthesized FASTA file.
synth_data = pd.DataFrame(
    [x.description for x in SeqIO.parse('../data/milo_synthetized_rubisco.faa', "fasta")],
    columns=['syn_ID'])
uclust_data = uclust_data.merge(synth_data, left_on='Query', right_on='syn_ID',how='left')

# Flags are strings: '-1' by default, '1' for every row whose Centroid is shared
# with at least one kinetically characterized / synthesized sequence.
uclust_data['kinetic_flag'] = '-1'
uclust_data['syn_flag'] = '-1'

kinetic_centroid = uclust_data.loc[uclust_data['kinetic_ID'].notna(),'Centroid'].unique()
syn_centroid = uclust_data.loc[uclust_data['syn_ID'].notna(),'Centroid'].unique()
uclust_data.loc[uclust_data['Centroid'].isin(kinetic_centroid),'kinetic_flag'] = '1'
uclust_data.loc[uclust_data['Centroid'].isin(syn_centroid),'syn_flag'] = '1'

# The left merge with collapsed_nodes leaves Centroid empty for rows without a
# matching leaf; calling .split on such a value would raise AttributeError, so
# only rows that have a centroid contribute a legend line.
legend_rows = uclust_data[uclust_data['Centroid'].notna()]
lines = (legend_rows['Centroid'].apply(lambda x: x.split(' ')[0]).values + ','
         + legend_rows['kinetic_flag'].values + ',' + legend_rows['syn_flag'].values + '\n')

# One line per distinct "centroid,kinetic_flag,syn_flag" combination, sorted.
unique_lines = np.unique(lines)

In [19]:
# Copy the legend template to the output file and append the data lines built
# in the previous cell. Both files are closed by the `with` block itself, so no
# explicit close() calls are needed.
with open('../data/kinetic_sampling_legend.txt','r') as template_file, \
        open('../output/03_branch_length_cluster/kinetic_legend.txt', "w") as legend_file:
    last_row = ''
    for row in template_file:
        legend_file.write(row)
        last_row = row
    # If the template does not end with a newline, add one so the first data
    # line is not glued to the template's last line.
    if last_row and not last_row.endswith('\n'):
        legend_file.write('\n')
    for line in unique_lines:
        legend_file.write(line)

In [20]:
# NOTE: this cell reassigns uclust_data and adds columns to it, so it is meant
# to be run once after the cell that builds uclust_data.
type_data = pd.read_csv('../data/jaffe_et_al_2018_rubisco_types.csv')
# Attach the type of each Query where the type table has an entry for it.
uclust_data = uclust_data.merge(type_data, left_on='Query', right_on='ID',how='left')

# Distinct (Centroid, type) pairs seen among the typed rows. Without
# drop_duplicates() the merge below is many-to-many and copies every member row
# once per typed member of its cluster instead of once per distinct type.
type_centroids = (
    uclust_data.loc[uclust_data['type'].notna(), ['Centroid','type']]
    .drop_duplicates()
)

# Give every row the type(s) found in its cluster. Both frames have a 'type'
# column, so pandas suffixes them: 'type_x' is the row's own type, 'type_y'
# the one shared through the Centroid.
uclust_data = uclust_data.merge(type_centroids,left_on='Centroid',right_on='Centroid')

In [21]:
# Colour (hex) used for each type label in the legend.
color_map = {'I': '#28B463',
             'II': '#E74C3C',
             'II/III':'#AF7AC5',
             'IIIa':'#AED6F1',
             'IIIb':'#3498DB',
             'IIIc':'#1F618D',
             'IIIlike':'#5D6D7E',
             'IV':'#F4D03F',
             'IVlike':'#F8C471',
             'unknown':'#F442D4'}

# Types missing from color_map get the 'unknown' colour instead of raising
# KeyError. NOTE(review): assumes 'unknown' is meant as the fallback — confirm.
fallback_color = color_map['unknown']

# Rows without a Centroid cannot be labelled (.split would fail on them).
typed_rows = uclust_data[uclust_data['Centroid'].notna()]
lines = [
    centroid.split(' ')[0] + ',label,node,' + color_map.get(cluster_type, fallback_color) + ',1,normal\n'
    for centroid, cluster_type in zip(typed_rows['Centroid'], typed_rows['type_y'])
]
lines = set(lines)

# Copy the template, then append the distinct legend lines. Both files are
# closed by the `with` block, so no explicit close() calls are needed.
with open('../data/itol_legend_template.txt','r') as template_file, \
        open('../output/03_branch_length_cluster/type_legend.txt', "w") as legend_file:
    last_row = ''
    for row in template_file:
        legend_file.write(row)
        last_row = row
    # If the template does not end with a newline, add one so the first data
    # line is not glued to the template's last line.
    if last_row and not last_row.endswith('\n'):
        legend_file.write('\n')
    # Set iteration order differs between interpreter runs; sorting makes the
    # written file reproducible.
    for line in sorted(lines):
        legend_file.write(line)