In [1]:
import pandas as pd
from Bio import SeqIO, AlignIO, Seq
import numpy as np
from collections import Counter
from helper import *
import python_cipres.client as CipresClient
from ete3 import Tree, TreeStyle

In [2]:
def check_avg_length(node,th=0.4):
    """Tell whether the leaves below ``node`` lie, on average, closer than ``th``.

    The mean ``tot_dist`` over ``node.get_leaves()`` minus ``node.tot_dist``
    gives the average distance from ``node`` down to its leaves.

    Parameters
    ----------
    node : object exposing ``get_leaves()`` and a ``tot_dist`` attribute
        (the same attribute must be present on every returned leaf).
    th : float, default 0.4
        Threshold the average distance is compared against.

    Returns
    -------
    bool
        True when the average node-to-leaf distance is strictly below ``th``.
    """
    leaf_totals = [leaf.tot_dist for leaf in node.get_leaves()]
    mean_depth_below = np.mean(leaf_totals) - node.tot_dist
    return bool(mean_depth_below < th)

def collapse_tree(node,th=0.4):
    """Recursively group the leaves under ``node`` into clusters.

    If ``check_avg_length(node, th)`` holds, every leaf below ``node`` is put
    in one cluster labelled with the current value of the module-level
    ``cluster_id`` counter, which is then incremented. Otherwise the function
    recurses into each child and stacks the children's tables.

    The caller must define the global ``cluster_id`` (e.g. set it to 0) before
    the first call; it is read and mutated here.

    Parameters
    ----------
    node : tree node exposing ``get_leaves()``, ``get_children()`` and ``name``.
    th : float, default 0.4
        Threshold forwarded to ``check_avg_length``.

    Returns
    -------
    pandas.DataFrame
        Columns ``leaf_id`` and ``cluster_id``, one row per clustered leaf.
        Row indices are not reset, so they repeat across clusters.
    """
    global cluster_id
    table_columns = ['leaf_id','cluster_id']

    # Tight enough subtree: all of its leaves become a single cluster.
    if check_avg_length(node,th):
        members = [[leaf.name, cluster_id] for leaf in node.get_leaves()]
        cluster_id += 1
        return pd.DataFrame(members,columns=table_columns)

    # Otherwise descend and accumulate whatever each child subtree produces.
    stacked = pd.DataFrame([],columns=table_columns)
    for subtree in node.get_children():
        stacked = pd.concat([stacked,collapse_tree(subtree,th)])
    return stacked

In [18]:
# Load the tree from the Newick file at the path below.
t = Tree('../output/02_90p_autotrophic_rubisco_tree/RaxML/RAxML_bestTree.result')
# Annotate each traversed node with `tot_dist`. The loop variable is called `leaf`,
# but check_avg_length() later reads tot_dist on internal nodes as well, so this is
# expected to visit every node of the tree, not only the tips.
for leaf in t.traverse():
    # tot_dist = sum of the branch lengths of all ancestors + the node's own branch length.
    leaf.add_feature('tot_dist',np.array([x.dist for x in leaf.get_ancestors()]).sum()+leaf.dist)

In [19]:
# collapse_tree() reads and increments the module-level `cluster_id` counter.
# Reset it here so that re-running this cell numbers the clusters from 0 again.
# (A `global` statement is unnecessary at cell/module scope, so none is used.)
cluster_id = 0
# Cluster the whole tree with an average node-to-leaf distance threshold of 0.4.
collapsed_nodes = collapse_tree(t,0.4)

In [34]:
# Load the clustering table that carries a `type` column per sequence.
auto_rub90 = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/uclust_all_0.9_with_type.csv')
# Keep only the first space-delimited token of the target description; this is
# the key matched against the leaf names held in collapsed_nodes['leaf_id'].
auto_rub90['cut Target'] = auto_rub90['Target_y'].str.split(' ').str[0]
# Inner merge: rows whose target is not a leaf in collapsed_nodes are dropped.
auto_rub90 = auto_rub90.merge(collapsed_nodes,left_on='cut Target',right_on='leaf_id')
# Number of distinct branch-length clusters observed for each type.
auto_rub90.groupby('type')['cluster_id'].nunique()

type
I           24
II          32
II/III       6
III-like    14
IIIa        24
IIIc        18
Name: cluster_id, dtype: int64

In [73]:
# Flag leaves whose name starts with 'RBC' (presumably the synthesized sequences,
# given the column name - confirm against the naming scheme of the input FASTA).
collapsed_nodes['synth'] = collapsed_nodes['leaf_id'].str.startswith('RBC')

# Choose one representative ("Centroid") per cluster: the first flagged leaf if
# the cluster has any, otherwise simply the first leaf of the cluster.
prune_list = []
for cid, members in collapsed_nodes.groupby('cluster_id'):
    flagged = members[members['synth'].eq(True)]
    pool = flagged if len(flagged) > 0 else members
    representative = pool.iloc[0].leaf_id
    prune_list.append(representative)
    # Record the representative on every row belonging to this cluster.
    collapsed_nodes.loc[collapsed_nodes.cluster_id==cid,'Centroid'] = representative

In [59]:
# Work on a copy so the full tree `t` is left untouched.
t_pruned = t.copy()
# Keep only the leaves listed in prune_list (one representative per cluster).
# NOTE(review): preserve_branch_length=True is expected to fold the lengths of
# removed nodes into the remaining branches - confirm against the ete3 docs.
t_pruned.prune(prune_list,preserve_branch_length=True)
# Print a summary of the pruned tree (leaf count, node count, rooting, farthest node).
t_pruned.describe()

Number of leaf nodes:	124
Total number of nodes:	246
Rooted:	No
Most distant node:	TARA_070.SAMEA2621092.800.0.22-3_1641160_1
Max. distance:	5.586885


In [60]:
# Make sure the output directory exists (IPython shell escape; -p makes it a no-op if present).
!mkdir -p ../output/03_branch_length_cluster
# Save the pruned tree. The "0.4" in the file name mirrors the clustering threshold
# passed to collapse_tree(); keep the two in sync if the threshold changes.
# NOTE(review): features=[] is understood to export all node features in extended
# Newick (NHX) form - confirm against the ete3 `write` documentation.
t_pruned.write(features=[],outfile='../output/03_branch_length_cluster/pruned_tree0.4.nwk')

In [87]:
# Full FASTA description lines of the kinetically characterized sequences.
kinetic_data = pd.DataFrame([x.description for x in SeqIO.parse('../data/flamholz_et_al_2019_kinetically_characterized.faa', "fasta")],columns=['kinetic_ID'])
uclust_data = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.csv')
# Drop the 'S' records. The explicit .copy() makes the column edits below act on
# an independent frame instead of a possible view of the frame read from disk
# (avoids SettingWithCopyWarning and ambiguous write-through).
uclust_data = uclust_data[uclust_data['Type'] !='S'].copy()
# Rows whose Target is '*' have no separate target: use their own Query instead.
is_self_target = uclust_data['Target'] == '*'
uclust_data.loc[is_self_target,'Target'] = uclust_data.loc[is_self_target,'Query']
# First space-delimited token of the target; matches the tree leaf names.
uclust_data['cut Target'] = uclust_data['Target'].str.split(' ').str[0]
# Left merges keep every uclust row; unmatched rows get NaN in the added columns.
uclust_data = uclust_data.merge(kinetic_data, left_on='Query', right_on='kinetic_ID',how='left')
uclust_data = uclust_data.merge(collapsed_nodes, left_on='cut Target', right_on='leaf_id',how='left')
kinetic_measured = uclust_data[uclust_data['kinetic_ID'].notna()]
# One "<centroid>,1,-1" row per kinetically characterized sequence.
# NOTE(review): a row whose target is absent from collapsed_nodes has a NaN
# Centroid and makes the split below raise - confirm none are expected.
lines = kinetic_measured['Centroid'].apply(lambda x: x.split(' ')[0]).values + ',1,-1\n'

# Same mapping for the synthesized sequences, emitted as "<centroid>,-1,1" rows.
synth_data = pd.DataFrame([x.description for x in SeqIO.parse('../data/milo_synthetized_rubisco.faa', "fasta")],columns=['syn_ID'])
uclust_data = uclust_data.merge(synth_data, left_on='Query', right_on='syn_ID',how='left')
synth = uclust_data[uclust_data['syn_ID'].notna()]
lines_synth = synth['Centroid'].apply(lambda x: x.split(' ')[0]).values + ',-1,1\n'

In [None]:


# Build the output file: the legend template first, then the kinetic rows, then
# the synthesized rows.
# NOTE(review): `outfile` is not assigned in any cell visible here - define the
# destination path before running this cell.
with open('../data/kinetic_sampling_legend.txt','r') as legend_file, open(outfile, "w") as out_file:
    legend = legend_file.read()
    out_file.write(legend)
    # Guard against a template without a trailing newline, which would glue the
    # first data row onto the last legend line.
    if legend and not legend.endswith('\n'):
        out_file.write('\n')
    out_file.writelines(lines)
    out_file.writelines(lines_synth)
# Both files are closed by the `with` statement; no explicit close() is needed.