In [4]:
import pandas as pd
from Bio import SeqIO, AlignIO, Seq
import numpy as np
from collections import Counter
from helper import *
import python_cipres.client as CipresClient
from ete3 import Tree, TreeStyle

In [5]:
def check_avg_length(node,th=0.4):
    """Tell whether the leaves under `node` are, on average, closer than `th`.

    The mean `tot_dist` of all leaves returned by `node.get_leaves()` is
    taken, and the node's own `tot_dist` is subtracted from it.

    Parameters
    ----------
    node : object exposing `tot_dist` and `get_leaves()`; every returned
        leaf must expose `tot_dist` as well.
    th : float, threshold the mean leaf distance is compared against.

    Returns
    -------
    bool : True when the mean distance is strictly below `th`, else False.
    """
    leaf_depths = [leaf.tot_dist for leaf in node.get_leaves()]
    mean_leaf_distance = np.mean(leaf_depths) - node.tot_dist
    return bool(mean_leaf_distance < th)

def collapse_tree(node,th=0.4):
    """Partition the leaves below `node` into clusters of nearby leaves.

    The tree is walked top-down in pre-order, visiting children in the order
    `get_children()` returns them. The first node on each path for which
    `check_avg_length(node, th)` is true becomes one cluster: all of its
    leaves receive the current value of the module-level counter
    `cluster_id`, which is then incremented. Nodes that fail the check are
    expanded into their children.

    The caller must initialise the global `cluster_id` before calling; it is
    read and advanced here so ids keep increasing across calls.

    Parameters
    ----------
    node : tree node exposing `get_leaves()`, `get_children()` and whatever
        `check_avg_length` reads from it; leaves must expose `name`.
    th : float, threshold forwarded to `check_avg_length`.

    Returns
    -------
    pandas.DataFrame with columns `leaf_id` and `cluster_id`, one row per
    clustered leaf, with a fresh 0..n-1 index. The frame is empty when no
    node passes the check.
    """
    global cluster_id
    rows = []
    # An explicit stack replaces recursion plus repeated pd.concat: the frame
    # is built once, and deep trees cannot hit the recursion limit.
    pending = [node]
    while pending:
        current = pending.pop()
        if check_avg_length(current,th):
            rows.extend([leaf.name, cluster_id] for leaf in current.get_leaves())
            cluster_id = cluster_id + 1
        else:
            # Push children reversed so they are popped in their original
            # order, which keeps cluster numbering identical to a recursive
            # pre-order walk.
            pending.extend(reversed(current.get_children()))
    return pd.DataFrame(rows,columns=['leaf_id','cluster_id'])

In [6]:
# Load the type I subtype table and the 90%-identity cluster table.
type1_sub = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/rubisco_type1_subtypes.csv')
auto_90p = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9_no_outliers_with_type.csv')

# The join key is the first space-delimited token of the Target_90 header.
auto_90p['ID_90'] = auto_90p.Target_90.apply(lambda header: header.split(' ')[0])
auto_90p_with_type = auto_90p.merge(type1_sub, how='outer', left_on='ID_90', right_on='Label')

# Rows typed 'I' are relabelled as '<subtype>-<domain>'.
is_type1 = auto_90p_with_type['type'] == 'I'
auto_90p_with_type.loc[is_type1, 'type'] = (
    auto_90p_with_type.loc[is_type1, 'subtype']
    + '-'
    + auto_90p_with_type.loc[is_type1, 'domain']
)

# Anything still missing a type (including type I rows lacking a subtype or
# domain, whose concatenation is NaN) is labelled 'unknown'.
missing_type = auto_90p_with_type['type'].isna()
auto_90p_with_type.loc[missing_type, 'type'] = 'unknown'

# Number of distinct 90% cluster representatives per type.
auto_90p_with_type.groupby(['type'])['Target_90'].nunique()

type
1-Prok          89
1A-Prok        102
1B-Euk         136
1B-Prok         20
1C-Prok        126
1D-Euk          77
II             174
II/III          52
III-like        66
IIIa            66
IIIc            36
IV-outgroup      1
unknown         64
Name: Target_90, dtype: int64

In [13]:
auto_90_id_type = auto_90p_with_type.loc[~auto_90p_with_type[['Target_90','type']].duplicated(),['Target_90','type']]
auto_90_id_type.columns = ['ID','type']
!mkdir -p ../output/03_branch_length_cluster
auto_90_id_type.to_csv('../output/03_branch_length_cluster/90_label_type_id.csv',index=False)

In [14]:
# Show the row(s) labelled as the type IV outgroup.
auto_90p_with_type.loc[auto_90p_with_type['type'] == 'IV-outgroup']

Unnamed: 0.1,Unnamed: 0,Type_70,Cluster_70,Size_70,%Id_70,Strand_70,Qlo_70,Tlo_70,Alignment_70,Query,...,%Id_90,Strand_90,Qlo_90,Tlo_90,Alignment_90,Target_90,ID_90,Label,subtype,domain
37062,37060,C,165,1,*,*,*,*,*,"gi|1160377249|ref|WP_079421992.1| ribulose 1,5...",...,*,*,*,*,*,"gi|1160377249|ref|WP_079421992.1| ribulose 1,5...",gi|1160377249|ref|WP_079421992.1|,,,


In [15]:
# Load the RAxML bipartitions tree (ete3 newick `format=2`) and re-root it on
# a hard-coded outgroup leaf.
# NOTE(review): this leaf name differs from the ID_90 of the row displayed
# above for type 'IV-outgroup' (gi|1160377249|ref|WP_079421992.1|) — confirm
# which sequence is the intended outgroup.
t = Tree('../output/02_90p_autotrophic_rubisco_tree/RaxML/RAxML_bipartitions.result', format=2)
outgroup_leaf = t.get_leaves_by_name('gi|1129860231|gb|OLS27808.1|')[0]
t.set_outgroup(outgroup_leaf)

# Annotate every node (internal nodes as well as leaves) with `tot_dist`:
# the sum of the `dist` values of all its ancestors plus its own `dist`.
for node in t.traverse():
    ancestor_dists = np.array([ancestor.dist for ancestor in node.get_ancestors()])
    node.add_feature('tot_dist', ancestor_dists.sum() + node.dist)

In [18]:
# `collapse_tree` reads and advances the module-level counter `cluster_id`,
# so it is reset here to make the cell give the same ids on every re-run.
# (A `global` statement is not needed at module level.)
cluster_id = 0

# Threshold on the mean leaf distance below which a clade is collapsed into
# one cluster.
collapse_threshold = 0.39
collapsed_nodes = collapse_tree(t, collapse_threshold)

# Attach the cluster id of each tree leaf to the sequence table; the inner
# join keeps only rows whose ID_90 is a leaf of the tree.
collapsed_auto_90p = auto_90p_with_type.merge(collapsed_nodes,left_on='ID_90',right_on='leaf_id')

# Number of distinct clusters each rubisco type is spread over.
collapsed_auto_90p.groupby('type')['cluster_id'].nunique()

type
1-Prok         18
1A-Prok         8
1B-Euk          1
1B-Prok         6
1C-Prok         8
1D-Euk          1
II             10
II/III          6
III-like       11
IIIa           31
IIIc           20
IV-outgroup     1
unknown        42
Name: cluster_id, dtype: int64

In [19]:
# Number of distinct rubisco types found inside each cluster, in ascending order.
types_per_cluster = collapsed_auto_90p.groupby('cluster_id')['type'].nunique()
types_per_cluster.sort_values()

cluster_id
0      1
104    1
105    1
106    1
107    1
108    1
109    1
110    1
103    1
111    1
113    1
114    1
115    1
116    1
117    1
118    1
119    1
112    1
102    1
101    1
100    1
83     1
84     1
85     1
86     1
87     1
88     1
89     1
90     1
91     1
      ..
72     1
73     1
74     1
75     1
76     1
77     1
78     1
71     1
61     1
60     1
59     1
42     1
43     1
44     1
45     1
46     1
47     1
48     1
49     1
50     1
51     1
52     1
53     1
54     1
55     1
56     1
57     1
58     1
40     1
162    1
Name: type, Length: 163, dtype: int64

In [20]:
# Flag leaves whose id starts with 'RBC'; these are preferred as cluster
# representatives.
collapsed_nodes['synth'] = collapsed_nodes['leaf_id'].str.startswith('RBC')

# For each cluster keep one representative leaf (the "Centroid"): the first
# flagged member if the cluster has any, otherwise the first member. The
# chosen leaf is recorded on both tables and collected in `prune_list`.
prune_list = []
for cid, members in collapsed_nodes.groupby('cluster_id'):
    flagged = members[members['synth']==True]
    candidates = flagged if len(flagged) > 0 else members
    centroid = candidates.iloc[0].leaf_id
    prune_list.append(centroid)
    collapsed_nodes.loc[collapsed_nodes.cluster_id == cid, 'Centroid'] = centroid
    collapsed_auto_90p.loc[collapsed_auto_90p.cluster_id == cid, 'Centroid'] = centroid

In [21]:
# Prune a copy, so the full tree `t` stays available, down to the one
# representative leaf chosen per cluster, then summarise the result.
t_pruned = t.copy()
t_pruned.prune(nodes=prune_list, preserve_branch_length=True)
t_pruned.describe()

Number of leaf nodes:	163
Total number of nodes:	325
Rooted:	Yes
Most distant node:	RBC_41
Max. distance:	4.757974


In [22]:
# Save the pruned tree as newick.
# NOTE(review): `features=[]` is presumably ete3's "export all node features" setting — confirm.
t_pruned.write(outfile='../output/03_branch_length_cluster/pruned_tree0.39.nwk', features=[])

In [23]:
# Persist the sequence table with its cluster ids and centroids (the row index is written too).
collapsed_auto_90p.to_csv(path_or_buf='../output/03_branch_length_cluster/data_with_centroids.0.39.csv')

In [80]:
# Build the legend file that colours each centroid label by rubisco type.
type_labels = pd.read_csv('../output/00_100p_tree/uclust_all_1_rubisco_types.csv')
data_with_types = collapsed_auto_90p.merge(type_labels,left_on='Query',right_on='ID')

# `type_y` is the `type` column that came from `type_labels`; pandas adds the
# suffix because both merged frames carry a `type` column.
labeled_leaves = data_with_types[~pd.isna(data_with_types['type_y'])]
color_map = {'I': '#28B463',
             'II': '#E74C3C',
             'II/III':'#AF7AC5',
             'IIIa':'#AED6F1',
             'IIIb':'#3498DB',
             'IIIc':'#1F618D',
             'IIIlike':'#5D6D7E',
             'IV':'#F4D03F',
             'IVlike':'#F8C471',
             'IV-outgroup':'#F8C471',
             'unknown':'#F442D4'}

# One '<centroid id>,label,node,<colour>,1,normal' line per distinct
# (centroid, type) pair. A type missing from `color_map` raises KeyError.
centroid_ids = labeled_leaves['Centroid'].apply(lambda x: x.split(' ')[0])
# Sorting makes the file reproducible: iterating a bare set of strings yields
# an order that can change from one kernel session to the next.
lines = sorted({
    centroid + ',label,node,' + color_map[rubisco_type] + ',1,normal\n'
    for centroid, rubisco_type in zip(centroid_ids, labeled_leaves['type_y'])
})

# Copy the template header, then append the data lines. The `with` block
# closes both files, so no explicit close() is needed.
with open('../data/itol_legend_template.txt','r') as template, \
        open('../output/03_branch_length_cluster/type_legend.txt', "w") as legend:
    legend.writelines(template)
    legend.writelines(lines)


In [25]:
# Build the legend file that flags, per centroid, whether its cluster contains
# a sequence listed in the kinetic table and/or in the synth table.
kinetic_data = pd.read_csv('../output/00_100p_tree/uclust_all_1_kinetic_data.csv',names=['kinetic_ID'])
synth_data = pd.read_csv('../output/00_100p_tree/synth_data.csv',names=['syn_ID'])

# Outer joins keep every sequence; kinetic_ID / syn_ID are non-null only on
# rows whose Query matched the corresponding table.
data_with_kin = collapsed_auto_90p.merge(kinetic_data, left_on='Query', right_on='kinetic_ID',how='outer')
data_with_kin = data_with_kin.merge(synth_data, left_on='Query', right_on='syn_ID',how='outer')

# Flags are strings: '-1' by default, '1' where the centroid's cluster has a match.
data_with_kin['kinetic_flag'] = '-1'
data_with_kin['syn_flag'] = '-1'

kinetic_centroid = data_with_kin.loc[~pd.isna(data_with_kin['kinetic_ID']),'Centroid'].unique()
syn_centroid = data_with_kin.loc[~pd.isna(data_with_kin['syn_ID']),'Centroid'].unique()
data_with_kin.loc[data_with_kin['Centroid'].isin(kinetic_centroid),'kinetic_flag'] = '1'
data_with_kin.loc[data_with_kin['Centroid'].isin(syn_centroid),'syn_flag'] = '1'

# Rows introduced only by the outer joins have no centroid and are dropped.
data_with_label = data_with_kin[~data_with_kin.Centroid.isna()]
lines = data_with_label['Centroid'].apply(lambda x: x.split(' ')[0]).values + ','+ data_with_label['kinetic_flag'].values+','+data_with_label['syn_flag'].values+'\n'

# Every member of a cluster produces the same line, so only the distinct
# lines are written. Previously `unique_lines` was computed but the
# un-deduplicated `lines` were written, repeating each centroid once per
# cluster member.
unique_lines = np.unique(lines)
with open('../data/kinetic_sampling_legend.txt','r') as template, \
        open('../output/03_branch_length_cluster/kinetic_legend.txt', "w") as legend:
    legend.writelines(template)
    for line in unique_lines:
        legend.write(line)


In [24]:
# `add_type` comes from the star-imported `helper` module; its behaviour is not visible here.
# NOTE(review): `outfile` is the same path the type-legend cell writes — confirm whether it is overwritten or extended.
type_table_path = '../output/03_branch_length_cluster/90_label_type_id.csv'
sequence_table_path = '../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9_no_outliers.csv'
legend_path = '../output/03_branch_length_cluster/type_legend.txt'
add_type(type_file=type_table_path,
         seq_file=sequence_table_path,
         outfile=legend_path)

In [89]:
# Display the rows that carry a centroid, with their kinetic_flag / syn_flag columns.
data_with_label

Unnamed: 0.1,Unnamed: 0,Type_70,Cluster_70,Size_70,%Id_70,Strand_70,Qlo_70,Tlo_70,Alignment_70,Query,...,Label,subtype,domain,leaf_id,cluster_id,Centroid,kinetic_ID,syn_ID,kinetic_flag,syn_flag
0,0.0,H,1.0,486.0,74.2,.,0,0,2I472M14D,RBCSeed_18 gi|563352309|gb|AHB41464.1|,...,,,,RBCSeed_18,150,RBCSeed_15,,RBCSeed_18 gi|563352309|gb|AHB41464.1|,-1,1
1,48.0,H,1.0,485.0,73.9,.,0,0,2I254MI217M14D,RAAC1_SR1_1_647,...,,,,RBCSeed_18,150,RBCSeed_15,,,-1,1
2,1.0,H,1.0,479.0,85.4,.,0,0,474M5D,RBC4_4 gi|504865161|ref|WP_015052263.1|,...,,,,RBC4_4,150,RBCSeed_15,,RBC4_4 gi|504865161|ref|WP_015052263.1|,-1,1
3,2.0,H,1.0,477.0,83.3,.,0,0,I473M4D,RBC4_6 gi|503663498|ref|WP_013897574.1|,...,,,,RBC4_6,150,RBCSeed_15,,RBC4_6 gi|503663498|ref|WP_013897574.1|,-1,1
4,3.0,H,1.0,485.0,75.3,.,0,0,4D474M7D,gi|1572541161|emb|VDS11207.1| RuBisCO long cha...,...,,,,gi|1572541161|emb|VDS11207.1|,150,RBCSeed_15,,,-1,1
5,4.0,H,1.0,485.0,75.5,.,0,0,4D474M7D,gi|1572541123|emb|VDS11188.1| RuBisCO long cha...,...,,,,gi|1572541161|emb|VDS11207.1|,150,RBCSeed_15,,,-1,1
6,27.0,H,1.0,485.0,74.9,.,0,0,4D474M7D,RBCSeed_7 gi|1101080583|gb|OIO40810.1|,...,,,,gi|1572541161|emb|VDS11207.1|,150,RBCSeed_15,,RBCSeed_7 gi|1101080583|gb|OIO40810.1|,-1,1
7,29.0,H,1.0,485.0,75.1,.,0,0,4D474M7D,gi|1572541151|emb|VDS11202.1| RuBisCO long cha...,...,,,,gi|1572541161|emb|VDS11207.1|,150,RBCSeed_15,,,-1,1
8,30.0,H,1.0,485.0,75.3,.,0,0,4D474M7D,gi|1572541141|emb|VDS11197.1| RuBisCO long cha...,...,,,,gi|1572541161|emb|VDS11207.1|,150,RBCSeed_15,,,-1,1
9,36795.0,H,606.0,312.0,70.6,.,0,0,161I194M29D86M3D,gi|1572539644|emb|VDD88908.1| RuBisCO long cha...,...,,,,gi|1572541161|emb|VDS11207.1|,150,RBCSeed_15,,,-1,1
