In [1]:
import os, shutil
import pandas as pd
from Bio import SeqIO

In [11]:
# Concatenate the three input FASTA files into one file, ../data/merged_data.faa,
# which the following cells use as the clustering input.
!cat ../data/RuBisCO.300-700.faa ../data/flamholz_et_al_2019_kinetically_characterized.faa ../data/erb_et_al_2012_rubisco_superfamilies.faa > ../data/merged_data.faa

In [12]:
# Merged FASTA file that is handed to USEARCH for clustering.
fasta = "../data/merged_data.faa"

# Command line for the Linux USEARCH binary (point the first entry at a
# different executable when running on another platform): cluster_fast at
# identity threshold 0.7, cluster table written to the .uc file.
uclust = " ".join([
    "../bin/usearch11.0.667_i86linux32",
    "-cluster_fast", fasta,
    "-id", "0.7",
    "-uc", "../output/01_uclust_all_0.7.uc",
])

# Windows equivalent, kept for reference:
# uclust = r"..\bin\usearch11.0.667_win32.exe -cluster_fast %s -id 0.7 -uc ..\output\01_uclust_all_0.7.uc" % fasta

In [13]:
# Run the USEARCH command string held in `uclust` and keep its exit status.
exit_status = os.system(uclust)

# os.system() does not raise when the command fails, so check explicitly:
# otherwise the next cells would silently parse a stale or missing .uc file.
if exit_status != 0:
    raise RuntimeError(
        "usearch clustering failed (exit status %s): %s" % (exit_status, uclust)
    )

# Display the status as the cell output (0 on success).
exit_status

0

In [14]:
# Column names assigned to the ten tab-separated fields of the USEARCH .uc table.
header = ['Type','Cluster','Size','%Id','Strand','Qlo','Tlo','Alignment','Query','Target']

# NOTE: this rebinds `uclust` from the command string to the parsed table;
# the name is kept because other cells may rely on it, but the command cell
# must be re-run before calling os.system(uclust) again.
uclust = pd.read_csv(r"../output/01_uclust_all_0.7.uc", sep='\t', names=header, index_col=False)

# Rows of Type 'C' are taken as the per-cluster centroid records; their
# 'Query' field carries the full FASTA header, of which only the first
# whitespace-delimited token (the sequence id) is kept.
centroids = uclust[uclust['Type']=='C']
c_list = [label.split(" ")[0] for label in centroids['Query'].values]

# Set copy of the ids so the membership test below is O(1) per record
# instead of scanning the whole list for every sequence in the FASTA file.
centroid_ids = set(c_list)

# Keep every record of the merged FASTA whose id is a centroid id.
sequences = [
    record
    for record in SeqIO.parse(fasta, "fasta")
    if record.id in centroid_ids
]

# Write the centroid sequences for the alignment step.
with open(r"../output/01_uclust_all_0.7.faa", "w") as output_handle:
    SeqIO.write(sequences, output_handle, "fasta")

# Persist the full cluster table (all record types) with a header row.
uclust.to_csv(r'../output/01_uclust_all_0.7.csv',index=False)

In [16]:
# Align the centroid sequences written to 01_uclust_all_0.7.faa with MAFFT
# (no extra options passed); the alignment printed on stdout is redirected
# to ../output/02_uclust_all_0.7.aln.
!../bin/mafft-linux64/mafft.bat ../output/01_uclust_all_0.7.faa > ../output/02_uclust_all_0.7.aln

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..

There are 39 ambiguous characters.
  401 / 470
done.

Constructing a UPGMA tree (efffree=0) ... 
  460 / 470
done.

Progressive alignment 1/2... 
STEP   321 / 469 
Reallocating..done. *alloclen = 2525
STEP   458 / 469 
Reallocating..done. *alloclen = 3534
STEP   469 / 469  h
done.

Making a distance matrix from msa.. 
  400 / 470
done.

Constructing a UPGMA tree (efffree=1) ... 
  460 / 470
done.

Progressive alignment 2/2... 
STEP   293 / 469 
Reallocating..done. *alloclen = 2482
STEP   462 / 469  h
Reallocating..done. *alloclen = 3519
STEP   469 / 469  h
done.

disttbfast (aa) Version 7.427
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more inf

In [19]:
# Build a tree from the MAFFT alignment with FastTree, passing the search
# options -spr 4 -mlacc 2 -slownni; the tree printed on stdout is redirected
# to ../output/03_uclust_all_0.7.nwk.
!../bin/FastTree -spr 4 -mlacc 2 -slownni ../output/02_uclust_all_0.7.aln > ../output/03_uclust_all_0.7.nwk

FastTree Version 2.1.10 SSE3
Alignment: ../output/02_uclust_all_0.7.aln
Amino acid distances: BLOSUM45 Joins: balanced Support: SH-like 1000
Search: Normal +NNI +SPR (4 rounds range 10) +ML-NNI opt-each=2
TopHits: 1.00*sqrtN close=default refresh=0.80
ML Model: Jones-Taylor-Thorton, CAT approximation with 20 rate categories
Ignored unknown character B (seen 5 times)
Ignored unknown character X (seen 39 times)
Ignored unknown character Z (seen 4 times)
Initial topology in 0.87 seconds
Refining topology: 36 rounds ME-NNIs, 4 rounds ME-SPRs, 18 rounds ML-NNIs
Total branch-length 131.107 after 10.56 sec
ML-NNI round 1: LogLk = -289885.926 NNIs 89 max delta 26.13 Time 22.35
Switched to using 20 rate categories (CAT approximation)
Rate categories were divided by 0.916 so that average rate = 1.0
CAT-based log-likelihoods may not be comparable across runs
Use -gamma for approximate but comparable Gamma(20) log-likelihoods
ML-NNI round 2: LogLk = -275463.058 NNIs 57 max delta 19.31 Time 35.20
M

In [33]:
# Table of sequence IDs with their verified rubisco type, and the cluster
# table saved by the clustering cell.
labels = pd.read_csv('../data/verified_rubisco_types.csv')
uclust_data = pd.read_csv('../output/01_uclust_all_0.7.csv')

# Drop the 'S' records. The explicit .copy() makes the filtered frame
# independent, so the assignment below cannot raise SettingWithCopyWarning.
uclust_data = uclust_data[uclust_data['Type'] != 'S'].copy()

# Rows whose Target is '*' have no separate target: use their own Query.
is_self_target = uclust_data['Target'] == '*'
uclust_data.loc[is_self_target, 'Target'] = uclust_data.loc[is_self_target, 'Query']

# Attach the verified type to each row (left join keeps unlabeled rows),
# then keep only the rows that actually received a type.
uclust_data = uclust_data.merge(labels, left_on='Query', right_on='ID', how='left')
labeled_leaves = uclust_data[uclust_data['type'].notna()]

# Colour assigned to each rubisco type in the iTOL annotation rows.
color_map = {'1': '#a0bc5d', '2': '#1d69d2', '3':'#ff0000','2_3':'#ffbf00'}

# Fail with a message naming the offending values if a type has no colour.
unknown_types = set(labeled_leaves['type']) - set(color_map)
if unknown_types:
    raise KeyError(
        "no colour defined for rubisco type(s): %s"
        % ", ".join(sorted(str(t) for t in unknown_types))
    )

# One row per labeled sequence: "<target id>,label,node,<colour>,1,normal\n",
# where the id is the first space-delimited token of the Target field.
leaf_ids = labeled_leaves['Target'].str.split(' ').str[0]
leaf_colors = labeled_leaves['type'].map(color_map)
lines = (leaf_ids + ',label,node,' + leaf_colors + ',1,normal\n').to_numpy()
lines

array(['gi|37087672|sp|O58677.1|RBL_PYRHO,label,node,#ff0000,1,normal\n',
       'gi|37087672|sp|O58677.1|RBL_PYRHO,label,node,#ff0000,1,normal\n',
       'gi|505096769|ref|WP_015283871.1|,label,node,#ff0000,1,normal\n',
       'gi|523387592|emb|CDF56957.1|,label,node,#a0bc5d,1,normal\n',
       'gi|523387592|emb|CDF56957.1|,label,node,#a0bc5d,1,normal\n',
       'gi|518331021|ref|WP_019501228.1|,label,node,#a0bc5d,1,normal\n',
       'gi|1055947567|ref|WP_067489955.1|,label,node,#a0bc5d,1,normal\n',
       'gi|1055947567|ref|WP_067489955.1|,label,node,#a0bc5d,1,normal\n',
       'gi|703265142|emb|CDI73524.1|,label,node,#a0bc5d,1,normal\n',
       'gi|1055947567|ref|WP_067489955.1|,label,node,#a0bc5d,1,normal\n',
       'gi|1055947567|ref|WP_067489955.1|,label,node,#a0bc5d,1,normal\n',
       'RBG_16_Gammaproteobacteria_62_13__RBG_16_scaffold_11150_2,label,node,#1d69d2,1,normal\n',
       'gi|851308879|ref|WP_048172188.1|,label,node,#ff0000,1,normal\n',
       'gi|851308879|ref|WP_0481

In [34]:
# Append the annotation rows in `lines` to the iTOL template file.
# 'a+' allows the current content to be read back first, so rows that are
# already present (from an earlier run of this cell, or repeated within
# `lines`) are written only once and re-running the cell is idempotent.
with open('../data/itol_legend_template.txt', 'a+') as file:
    file.seek(0)
    existing = file.read()
    already_written = set(existing.splitlines())

    # Move back to the end before writing.
    file.seek(0, 2)

    # Keep the first new row from being glued onto an unterminated last line.
    if existing and not existing.endswith('\n'):
        file.write('\n')

    for line in lines:
        row = line.rstrip('\n')
        if row not in already_written:
            file.write(line)
            already_written.add(row)
# The with-statement closes the file; no explicit close() is needed.