In [101]:
import os, shutil
import pandas as pd
from Bio import SeqIO

In [102]:
!cat ../data/milo_synthetized_rubisco.faa ../data/RuBisCO.300-700.faa ../data/flamholz_et_al_2019_kinetically_characterized.faa ../data/erb_et_al_2012_rubisco_superfamilies.faa > ../data/merged_data.faa

In [103]:
fasta = r"../data/merged_data.faa"

# Linux - Change bin
uclust = "../bin/usearch11.0.667_i86linux32 -cluster_fast %s -id 0.7 -uc ../output/01_uclust_all_0.7.uc" %fasta

# Windows
#uclust = r"..\bin\usearch11.0.667_win32.exe -cluster_fast %s -id 0.7 -uc ..\output\01_uclust_all_0.7.uc" %fasta

In [104]:
os.system(uclust) 

0

In [105]:
header = ['Type','Cluster','Size','%Id','Strand','Qlo','Tlo','Alignment','Query','Target']
uclust = pd.read_csv(r"../output/01_uclust_all_0.7.uc", sep='\t', names=header, index_col=False)
centroids = uclust[uclust['Type']=='C']
c_list = centroids.iloc[:,8].values
c_list = [c.split(" ")[0] for c in c_list]

sequences = []
for record in SeqIO.parse(fasta, "fasta"):
    if record.id in c_list:
        sequences.append(record)
        
with open(r"../output/01_uclust_all_0.7.faa", "w") as output_handle:
    SeqIO.write(sequences, output_handle, "fasta")

uclust.to_csv(r'../output/01_uclust_all_0.7.csv',index=False)

In [106]:
!../bin/mafft-linux64/mafft.bat ../output/01_uclust_all_0.7.faa > ../output/02_uclust_all_0.7.aln

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..

There are 39 ambiguous characters.
  401 / 471
done.

Constructing a UPGMA tree (efffree=0) ... 
  460 / 471
done.

Progressive alignment 1/2... 
STEP   323 / 470 
Reallocating..done. *alloclen = 2525
STEP   467 / 470  h
Reallocating..done. *alloclen = 3531
STEP   470 / 470  h
done.

Making a distance matrix from msa.. 
  400 / 471
done.

Constructing a UPGMA tree (efffree=1) ... 
  460 / 471
done.

Progressive alignment 2/2... 
STEP   294 / 470 
Reallocating..done. *alloclen = 2482
STEP   469 / 470  h
Reallocating..done. *alloclen = 3537
STEP   470 / 470  h
done.

disttbfast (aa) Version 7.427
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more i

In [107]:
!../bin/FastTree -spr 4 -mlacc 2 -slownni ../output/02_uclust_all_0.7.aln > ../output/03_uclust_all_0.7.nwk

FastTree Version 2.1.10 SSE3
Alignment: ../output/02_uclust_all_0.7.aln
Amino acid distances: BLOSUM45 Joins: balanced Support: SH-like 1000
Search: Normal +NNI +SPR (4 rounds range 10) +ML-NNI opt-each=2
TopHits: 1.00*sqrtN close=default refresh=0.80
ML Model: Jones-Taylor-Thorton, CAT approximation with 20 rate categories
Ignored unknown character B (seen 5 times)
Ignored unknown character X (seen 39 times)
Ignored unknown character Z (seen 4 times)
Initial topology in 0.84 seconds
Refining topology: 36 rounds ME-NNIs, 4 rounds ME-SPRs, 18 rounds ML-NNIs
Total branch-length 131.340 after 10.59 sec
ML-NNI round 1: LogLk = -290697.032 NNIs 84 max delta 26.06 Time 21.85
Switched to using 20 rate categories (CAT approximation)
Rate categories were divided by 0.917 so that average rate = 1.0
CAT-based log-likelihoods may not be comparable across runs
Use -gamma for approximate but comparable Gamma(20) log-likelihoods
ML-NNI round 2: LogLk = -276204.498 NNIs 66 max delta 15.95 Time 34.17
M

In [99]:
labels = pd.read_csv('../data/verified_rubisco_types.csv')
uclust_data = pd.read_csv('../output/01_uclust_all_0.7.csv')
uclust_data = uclust_data[uclust_data['Type'] !='S']
uclust_data.loc[uclust_data['Target'] == '*','Target'] = uclust_data.loc[uclust_data['Target'] == '*','Query']
uclust_data = uclust_data.merge(labels, left_on='Query', right_on='ID',how='left')
labeled_leaves = uclust_data[~pd.isna(uclust_data['type'])]
#labels[~labels['ID'].isin(uclust_data['ID'])]
#labels
color_map = {'1': '#a0bc5d', '2': '#1d69d2', '3':'#ff0000','2_3':'#ffbf00'}
lines = labeled_leaves['Target'].apply(lambda x: x.split(' ')[0]).values +[',label,node,'+color_map[x]+',1,normal\n' for x in labeled_leaves['type'].values]
#lines  

In [34]:
with open('../data/itol_legend_template.txt','r') as file:
    with open("../output/type_legend.txt", "w") as f1:
        for row in file:
            f1.write(row)
        for line in lines:
            f1.write(line)
        file.close()
        f1.close()

In [100]:
flamholz_data = pd.DataFrame([x.description for x in SeqIO.parse('../data/flamholz_et_al_2019_kinetically_characterized.faa', "fasta")],columns=['kinetic_ID'])
uclust_data = uclust_data.merge(flamholz_data, left_on='Query', right_on='kinetic_ID',how='left')
kinetic_measured = uclust_data[~pd.isna(uclust_data['kinetic_ID'])]
lines = kinetic_measured['Target'].apply(lambda x: x.split(' ')[0]).values + ',1,-1\n'

synth_data = pd.DataFrame([x.description for x in SeqIO.parse('../data/milo_synthetized_rubisco.faa', "fasta")],columns=['syn_ID'])
uclust_data = uclust_data.merge(flamholz_data, left_on='Query', right_on='syn_ID',how='left')
synth = uclust_data[~pd.isna(uclust_data['syn_ID'])]
lines_synth = synth['Target'].apply(lambda x: x.split(' ')[0]).values + ',-1,1\n'

with open('../data/kinetic_sampling_legend.txt','r') as file:
    with open("../output/kinetic_legend.txt", "w") as f1:
        for row in file:
            f1.write(row)
        for line in lines:
            f1.write(line)
        for line in lines_synth:
            f1.write(line)
        file.close()
        f1.close()

In [91]:
uclust_data[~pd.isna(uclust_data['kinetic_ID'])]
#uclust_data.loc[uclust_data['Query'].str.contains('AAA26115.1'),'Query'].loc[47373]
#flamholz_data.loc[0,'kinetic_ID']

Unnamed: 0,Type,Cluster,Size,%Id,Strand,Qlo,Tlo,Alignment,Query,Target,ID,type,comments,kinetic_ID
158,H,26,477,84.2,.,0,0,I475M2D,"AFV62913.1 ribulose-1,5-bisphosphate carboxyla...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"AFV62913.1 ribulose-1,5-bisphosphate carboxyla..."
290,H,26,476,82.8,.,0,0,I468MD7M,"NP_043033.1 ribulose-1,5-bisphosphate carboxyl...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"NP_043033.1 ribulose-1,5-bisphosphate carboxyl..."
551,H,26,482,83.4,.,0,0,6D476M,"ARJ58803.1 ribulose-1,5-bisphosphate carboxyla...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"ARJ58803.1 ribulose-1,5-bisphosphate carboxyla..."
940,H,26,477,82.9,.,0,0,I475M2D,"YP_009267237.1 ribulose-1,5-bisphosphate carbo...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"YP_009267237.1 ribulose-1,5-bisphosphate carbo..."
1306,H,26,472,75.6,.,0,0,9I467M5D,"AAA23328.1 ribulose-1,5-bisphosphate carboxyla...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"AAA23328.1 ribulose-1,5-bisphosphate carboxyla..."
1694,H,26,485,84.8,.,0,0,I475M10D,"ADW80658.1 ribulose-1,5-bisphosphate carboxyla...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"ADW80658.1 ribulose-1,5-bisphosphate carboxyla..."
1993,H,26,475,84.2,.,0,0,I475M,"ADK47564.1 ribulose-1,5-bisphosphate carboxyla...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"ADK47564.1 ribulose-1,5-bisphosphate carboxyla..."
3349,H,26,480,84.4,.,0,0,I475M5D,"AAK14845.1 ribulose-1,5-bisphosphate carboxyla...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"AAK14845.1 ribulose-1,5-bisphosphate carboxyla..."
6928,H,99,475,75.6,.,0,0,134M60D35M12D234M287I,"SCM15150.1 Large subunit of Ribulose-1,5-bisph...",gi|223541024|gb|EEF42581.1| conserved hypothet...,,,,"SCM15150.1 Large subunit of Ribulose-1,5-bisph..."
7139,H,26,477,83.6,.,0,0,I475M2D,"ABK79588.1 ribulose-1,5-bisphosphate carboxyla...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"ABK79588.1 ribulose-1,5-bisphosphate carboxyla..."
