In [1]:
import os, shutil
import pandas as pd
from Bio import SeqIO
import numpy as np
from Bio import AlignIO
from collections import Counter
from Bio import Seq

# 01 - merge fasta files and cluster sequences

In [2]:
# Shorten every header of the Jaffe et al. 2018 FASTA file to the text that precedes the
# first ':' and save the result as a "_clean" copy next to the original.
sequences = list(SeqIO.parse('../data/jaffe_et_al_2018_rubisco_superfamilies.faa', "fasta"))
for record in sequences:
    # description, id and name all carry the header text, so each one is trimmed
    for attr in ('description', 'id', 'name'):
        setattr(record, attr, getattr(record, attr).split(':')[0])

with open(r"../data/jaffe_et_al_2018_rubisco_superfamilies_clean.faa", "w") as output_handle:
    SeqIO.write(sequences, output_handle, "fasta")

In [227]:
# Concatenate the individual FASTA sources into a single file, ../data/merged_data.faa.
# The commented-out variant used the Erb et al. 2012 superfamily file where the active
# command uses the cleaned Jaffe et al. 2018 file.
#!cat ../data/milo_synthetized_rubisco.faa ../data/RuBisCO.300-700.faa ../data/flamholz_et_al_2019_kinetically_characterized.faa ../data/erb_et_al_2012_rubisco_superfamilies.faa > ../data/merged_data.faa
!cat ../data/milo_synthetized_rubisco.faa ../data/RuBisCO.300-700.faa ../data/flamholz_et_al_2019_kinetically_characterized.faa ../data/jaffe_et_al_2018_rubisco_superfamilies_clean.faa > ../data/merged_data.faa

In [228]:
# Cut every sequence of the merged FASTA at its first '*' character and rewrite the file
# in place. All records are loaded into memory first, so the input is completely read
# before the same path is reopened for writing.
sequences = list(SeqIO.parse('../data/merged_data.faa', "fasta"))
for record in sequences:
    record.seq = record.seq.split('*')[0]

with open(r"../data/merged_data.faa", "w") as output_handle:
    SeqIO.write(sequences, output_handle, "fasta")


In [229]:
# Path of the merged FASTA file that is clustered below.
fasta = r"../data/merged_data.faa"

# USEARCH command line (Linux binary - change the executable path for other platforms):
# cluster_fast with an identity threshold of 0.7, writing the cluster table in .uc format.
uclust = "../bin/usearch11.0.667_i86linux32 -cluster_fast %s -id 0.7 -uc ../output/01_uclust_all_0.7.uc" % (fasta,)

# Windows equivalent
#uclust = r"..\bin\usearch11.0.667_win32.exe -cluster_fast %s -id 0.7 -uc ..\output\01_uclust_all_0.7.uc" %fasta

In [230]:
# Run the USEARCH command string built in the previous cell through the shell.
# The value displayed as the cell output is the exit status returned by os.system.
os.system(uclust) 

0

In [292]:
# Parse the USEARCH .uc table, keep one FASTA record per cluster centroid, and save both
# the centroid sequences (FASTA) and the complete cluster table (CSV).
# NOTE: this rebinds `uclust` from the command string to a DataFrame, so the
# os.system(uclust) cell cannot be re-run after this one without rebuilding the command.
header = ['Type','Cluster','Size','%Id','Strand','Qlo','Tlo','Alignment','Query','Target']
uclust = pd.read_csv(r"../output/01_uclust_all_0.7.uc", sep='\t', names=header, index_col=False)

# Rows of type 'C' are selected as the centroid rows; their 'Query' column (position 8 in
# `header`) holds the sequence label.
centroids = uclust[uclust['Type']=='C']
# Only the first space-delimited token of the label is compared with the FASTA record id.
c_list = [c.split(" ")[0] for c in centroids['Query'].values]
centroid_ids = set(c_list)  # constant-time membership instead of scanning the list per record

sequences = []   # every FASTA record whose id is a centroid id (duplicates included)
seq2 = []        # unique centroid ids, in order of first appearance
seq3 = []        # first record seen for each unique centroid id
seen_ids = set()
for record in SeqIO.parse(fasta, "fasta"):
    if record.id not in centroid_ids:
        continue
    sequences.append(record)
    # The merged FASTA can contain the same id more than once; keep the first occurrence only.
    if record.id not in seen_ids:
        seen_ids.add(record.id)
        seq2.append(record.id)
        seq3.append(record)

with open(r"../output/01_uclust_all_0.7.faa", "w") as output_handle:
    SeqIO.write(seq3, output_handle, "fasta")

uclust.to_csv(r'../output/01_uclust_all_0.7.csv',index=False)

# 02 - Create multiple sequence alignment

In [293]:
# Align the centroid sequences written to ../output/01_uclust_all_0.7.faa with MAFFT
# (no extra options passed) and redirect the alignment to ../output/02_uclust_all_0.7.aln.
!../bin/mafft-linux64/mafft.bat ../output/01_uclust_all_0.7.faa > ../output/02_uclust_all_0.7.aln

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..

There are 57 ambiguous characters.
  601 / 601
done.

Constructing a UPGMA tree (efffree=0) ... 
  590 / 601
done.

Progressive alignment 1/2... 
STEP   414 / 600 
Reallocating..done. *alloclen = 2430
STEP   501 / 600  h
Reallocating..done. *alloclen = 3611

done.

Making a distance matrix from msa.. 
  500 / 601
done.

Constructing a UPGMA tree (efffree=1) ... 
  590 / 601
done.

Progressive alignment 2/2... 
STEP   401 / 600 
Reallocating..done. *alloclen = 2482
STEP   501 / 600 
Reallocating..done. *alloclen = 3519

done.

disttbfast (aa) Version 7.427
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft -

# 03 - clean MSA to contain only positions with more than 5% coverage (based on Jaffe et al. 2018)

In [12]:
# Remove alignment columns in which 95% or more of the sequences have a gap ('-'), i.e.
# keep positions with more than 5% coverage, and write the trimmed alignment as FASTA.
# NOTE(review): later cells read '../output/03_uclust_all_0.7_trimmed.aln', which no cell
# visible here writes; presumably it comes from running this cell on the commented-out
# input below with a different output path - confirm.
#aln = AlignIO.read('../output/02_uclust_all_0.7.aln',format='fasta')

aln = AlignIO.read('../data/jaffe_et_al_2018_rubisco_superfamilies.aln',format='fasta')
n_seqs = len(aln)

# mask[i] is True when column i is kept.
mask = []
for i in range(0,aln.get_alignment_length()):
    column = aln[:,i]  # the column as a string, extracted once per position
    mask.append(column.count('-')/n_seqs<0.95)

# Character matrix (sequences x positions) restricted to the kept columns.
np_aln = np.array(aln)
np_aln = np_aln[:,mask]

# Re-read the records from the same file and replace each sequence with its trimmed row.
aln_faa = SeqIO.parse('../data/jaffe_et_al_2018_rubisco_superfamilies.aln',format='fasta')
sequences = []
for record,i in zip(aln_faa,range(0,n_seqs)):
    # Bio.Alphabet was removed in Biopython 1.78; Seq objects are built from the string
    # alone, which works on both old and new Biopython releases.
    record.seq = Seq.Seq("".join(np_aln[i,:]))
    # Keep only the part of the id that precedes the first ':'.
    record.id = record.id.split(":")[0]
    sequences.append(record)

with open(r"../output/jaffe_et_al_2018_rubisco_superfamilies_trimmed.aln", "w") as output_handle:
    SeqIO.write(sequences, output_handle, "fasta")


In [14]:
# Status
#!curl -u yinonbaron:Mchcav11~ -H cipres-appkey:rubisco_phylogeny-49F87B124F3D429FBE12F95E4254DDEA https://cipresrest.sdsc.edu/cipresrest/v1/job/yinonbaron/NGBW-JOB-RAXMLHPC8_REST_XSEDE-3552B7E1CC8345EA97221BBD8C26649B
# Results
#!curl -u yinonbaron:Mchcav11~ -H cipres-appkey:rubisco_phylogeny-49F87B124F3D429FBE12F95E4254DDEA https://cipresrest.sdsc.edu/cipresrest/v1/job/yinonbaron/NGBW-JOB-RAXMLHPC8_REST_XSEDE-3552B7E1CC8345EA97221BBD8C26649B/output

# Download
#!curl -u yinonbaron:Mchcav11~ -H cipres-appkey:rubisco_phylogeny-49F87B124F3D429FBE12F95E4254DDEA https://cipresrest.sdsc.edu/cipresrest/v1/job/yinonbaron/NGBW-JOB-RAXMLHPC8_REST_XSEDE-3552B7E1CC8345EA97221BBD8C26649B/output/971441

#Most up to dat
!curl -u yinonbaron:Mchcav11~ -H cipres-appkey:rubisco_phylogeny-49F87B124F3D429FBE12F95E4254DDEA https://cipresrest.sdsc.edu/cipresrest/v1/job/yinonbaron -F tool=RAXMLHPC8_REST_XSEDE  -F input.infile_=@../output/jaffe_et_al_2018_rubisco_superfamilies_trimmed.aln -F metadata.statusEmail=true -F vparam.datatype_=protein -F vparam.runtime_=160 -F  vparam.select_analysis_=fa -F vparam.choose_bootstrap_=x -F vparam.printbrlength_=1

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<jobstatus>
    <selfUri>
        <url>https://cipresrest.sdsc.edu/cipresrest/v1/job/yinonbaron/NGBW-JOB-RAXMLHPC8_REST_XSEDE-21903929C1EA4B66B87B7C51CD2FB68B</url>
        <rel>jobstatus</rel>
        <title>NGBW-JOB-RAXMLHPC8_REST_XSEDE-21903929C1EA4B66B87B7C51CD2FB68B</title>
    </selfUri>
    <jobHandle>NGBW-JOB-RAXMLHPC8_REST_XSEDE-21903929C1EA4B66B87B7C51CD2FB68B</jobHandle>
    <jobStage>QUEUE</jobStage>
    <terminalStage>false</terminalStage>
    <failed>false</failed>
    <metadata>
        <entry>
            <key>statusEmail</key>
            <value>true</value>
        </entry>
    </metadata>
    <dateSubmitted>2019-04-07T05:19:29-07:00</dateSubmitted>
    <resultsUri>
        <url>https://cipresrest.sdsc.edu/cipresrest/v1/job/yinonbaron/NGBW-JOB-RAXMLHPC8_REST_XSEDE-21903929C1EA4B66B87B7C51CD2FB68B/output</url>
        <rel>results</rel>
        <title>Job Results</title>
    </resultsUri>
    <workingDirUri>
     

In [None]:
# 04 - Create tree

In [295]:
# Build a tree with FastTree from ../output/03_uclust_all_0.7_trimmed.aln and redirect the
# result to ../output/04_uclust_all_0.7.nwk. The recorded log below reports the search as
# "+SPR (4 rounds range 10) +ML-NNI opt-each=2" for the options -spr 4 -mlacc 2 -slownni.
# NOTE(review): the input alignment is not written by any cell visible in this notebook -
# confirm how it is produced.
!../bin/FastTree -spr 4 -mlacc 2 -slownni ../output/03_uclust_all_0.7_trimmed.aln > ../output/04_uclust_all_0.7.nwk

FastTree Version 2.1.10 SSE3
Alignment: ../output/03_uclust_all_0.7_trimmed.aln
Amino acid distances: BLOSUM45 Joins: balanced Support: SH-like 1000
Search: Normal +NNI +SPR (4 rounds range 10) +ML-NNI opt-each=2
TopHits: 1.00*sqrtN close=default refresh=0.80
ML Model: Jones-Taylor-Thorton, CAT approximation with 20 rate categories
Ignored unknown character B (seen 4 times)
Ignored unknown character X (seen 41 times)
Ignored unknown character Z (seen 4 times)
Initial topology in 0.60 seconds
Refining topology: 37 rounds ME-NNIs, 4 rounds ME-SPRs, 18 rounds ML-NNIs
Total branch-length 158.296 after 7.28 sec
ML-NNI round 1: LogLk = -341214.872 NNIs 108 max delta 16.87 Time 12.48
Switched to using 20 rate categories (CAT approximation)
Rate categories were divided by 1.307 so that average rate = 1.0
CAT-based log-likelihoods may not be comparable across runs
Use -gamma for approximate but comparable Gamma(20) log-likelihoods
ML-NNI round 2: LogLk = -323350.811 NNIs 45 max delta 17.74 Time

In [5]:
# Load the type annotation table (Jaffe et al. 2018) and display it.
labels = pd.read_csv('../data/jaffe_et_al_2018_rubisco_types.csv')
labels

Unnamed: 0,ID,Organism,type
0,2_Hydrogenovibrio_marinus_BAD1531_REF,reference,I
1,AAB41464_Aurantimonas_manganoxydans_SI85_I_REF,reference,I
2,ABA23512_Anabaena_variabilis_ATCC_29413_I_REF,reference,I
3,ABA56859_Nitrosococcus_oceani_ATCC_19707_I_REF,reference,I
4,Acidimicrobium_ferrooxidans_YP_00310876_REF,reference,I
5,Alkalilimnicola_ehrlichii_YP_74366_REF,reference,I
6,Allochromatium_vinosum_YP_00344469_REF,reference,I
7,Azoarcus_sp_KH32C_BAL2711_REF,reference,I
8,Bradyrhizobium_sp_ORS_278_YP_00120434_REF,reference,I
9,Burkholderia_xenovorans_YP_55288_REF,reference,I


In [79]:
# Attach the Jaffe et al. 2018 type labels to the cluster table and build one
# "<centroid id>,label,node,<colour>,1,normal" line per labelled sequence, keyed by the
# Target (centroid) of the cluster it belongs to. The lines are written into the legend
# file by the following cell.
labels = pd.read_csv('../data/jaffe_et_al_2018_rubisco_types.csv')
uclust_data = pd.read_csv('../output/01_uclust_all_0.7.csv')

# Drop the rows of type 'S'. .copy() makes the filtered frame independent, so the .loc
# assignment below does not write into a slice of the original (SettingWithCopyWarning).
uclust_data = uclust_data[uclust_data['Type'] !='S'].copy()

# Rows whose Target is '*' get their own Query label as Target.
is_self_target = uclust_data['Target'] == '*'
uclust_data.loc[is_self_target,'Target'] = uclust_data.loc[is_self_target,'Query']

# Left merge: rows without a matching label keep NaN in the label columns.
uclust_data = uclust_data.merge(labels, left_on='Query', right_on='ID',how='left')
labeled_leaves = uclust_data[~pd.isna(uclust_data['type'])]

# Colour assigned to each type label; a type missing from this map raises KeyError.
color_map = {'I': '#28B463', 'II': '#E74C3C', 'II/III':'#AF7AC5','IIIa':'#AED6F1','IIIb':'#3498DB','IIIc':'#1F618D','IIIlike':'#5D6D7E','IV':'#F4D03F','IVlike':'#F8C471','unknown':'#17202A'}

# Only the first space-delimited token of the Target label is used as the leaf id.
# A set removes duplicate lines (several labelled sequences can share one centroid).
lines = set(
    target.split(' ')[0] + ',label,node,' + color_map[rubisco_type] + ',1,normal\n'
    for target, rubisco_type in zip(labeled_leaves['Target'].values, labeled_leaves['type'].values)
)

In [327]:
# Display the rows of the merged cluster table whose type label is 'II'.
uclust_data[uclust_data['type'] == 'II']
#labeled_leaves

Unnamed: 0,Type,Cluster,Size,%Id,Strand,Qlo,Tlo,Alignment,Query,Target,ID,Organism,type
30,H,4,459,80.6,.,0,0,459M,ABB41020_Thiomicrospira_crunogena_XCL_2_II__REF,RBC3_1 WP_012823967.1,ABB41020_Thiomicrospira_crunogena_XCL_2_II__REF,reference,II
6974,H,4,451,78.0,.,0,0,6I451M2I,AAC38280_Riftia_pachyptila_endosymbiont_I_REF,RBC3_1 WP_012823967.1,AAC38280_Riftia_pachyptila_endosymbiont_I_REF,reference,II
6975,H,2,451,76.0,.,0,0,9I73M3I378M2I,P50922_Rhodobacter_capsulatus_ATCC11166_I_REF,RBCSeed_25 gi|771607269|ref|WP_045218222.1|,P50922_Rhodobacter_capsulatus_ATCC11166_I_REF,reference,II
6976,H,4,451,90.2,.,0,0,6I451M2I,Thiobacillus_denitrificans_AAA9917_REF,RBC3_1 WP_012823967.1,Thiobacillus_denitrificans_AAA9917_REF,reference,II
13947,H,175,452,71.0,.,0,0,6I30M7I422M3I,AAN52766_Rhodopseudomonas_palustris_I_REF,RBC_22 gi|269148484|gb|ACZ28625.1|,AAN52766_Rhodopseudomonas_palustris_I_REF,reference,II
13948,H,4,451,100.0,.,0,0,6I451M2I,Halothiobacillus_neapolitanus_YP_00326297_REF,RBC3_1 WP_012823967.1,Halothiobacillus_neapolitanus_YP_00326297_REF,reference,II
13949,H,4,451,89.4,.,0,0,6I451M2I,YP002220242_Acidithiobacillus_ferrooxidans_ATC...,RBC3_1 WP_012823967.1,YP002220242_Acidithiobacillus_ferrooxidans_ATC...,reference,II
21071,H,7,462,85.1,.,0,0,6I25MD436M18I,AAG37859_Symbiodinium_sp_I_REF,RBC_16 gi|84029424|sp|Q42813.2|RBL2_LINPO,AAG37859_Symbiodinium_sp_I_REF,reference,II
21072,H,4,451,84.3,.,0,0,6I451M2I,YP_522655_Rhodoferax_ferrireducens_T118_I_REF,RBC3_1 WP_012823967.1,YP_522655_Rhodoferax_ferrireducens_T118_I_REF,reference,II
28190,H,18,462,87.0,.,0,0,6I76M4D382M2I,Mariprofundus_ferrooxydans_ZP_0145121_REF,"gi|657350008|ref|WP_029406688.1| ribulose 1,5-...",Mariprofundus_ferrooxydans_ZP_0145121_REF,reference,II


In [80]:
# Write ../output/type_legend.txt: the contents of the legend template followed by the
# label lines held in `lines`. Both handles are closed by the with-statement, so no
# explicit close() calls are needed.
with open('../data/itol_legend_template.txt','r') as file, open("../output/type_legend.txt", "w") as f1:
    f1.writelines(file)   # copy the template row by row
    f1.writelines(lines)  # append the generated label lines

In [81]:
# Mark the cluster centroids (Target) of sequences that appear in the kinetically
# characterized set (Flamholz et al. 2019) or in the milo_synthetized_rubisco set, and
# write them after the template into ../output/kinetic_legend.txt.
# NOTE(review): the ',1,-1' / ',-1,1' suffixes presumably encode the two dataset columns
# of the legend template - confirm against ../data/kinetic_sampling_legend.txt.

# Match on the full FASTA description, which is what the 'Query' column is compared with.
flamholz_data = pd.DataFrame([x.description for x in SeqIO.parse('../data/flamholz_et_al_2019_kinetically_characterized.faa', "fasta")],columns=['kinetic_ID'])
# Dropping a kinetic_ID column left by an earlier run keeps this cell re-runnable;
# otherwise the merge would create kinetic_ID_x / kinetic_ID_y and the lookup below fails.
uclust_data = uclust_data.drop(columns=['kinetic_ID'], errors='ignore').merge(flamholz_data, left_on='Query', right_on='kinetic_ID',how='left')
kinetic_measured = uclust_data[~pd.isna(uclust_data['kinetic_ID'])]
lines = kinetic_measured['Target'].apply(lambda x: x.split(' ')[0]).values + ',1,-1\n'

synth_data = pd.DataFrame([x.description for x in SeqIO.parse('../data/milo_synthetized_rubisco.faa', "fasta")],columns=['syn_ID'])
# Same re-run guard for the syn_ID column.
uclust_data = uclust_data.drop(columns=['syn_ID'], errors='ignore').merge(synth_data, left_on='Query', right_on='syn_ID',how='left')
synth = uclust_data[~pd.isna(uclust_data['syn_ID'])]
lines_synth = synth['Target'].apply(lambda x: x.split(' ')[0]).values + ',-1,1\n'

# Both handles are closed by the with-statement; no explicit close() calls are needed.
with open('../data/kinetic_sampling_legend.txt','r') as file, open("../output/kinetic_legend.txt", "w") as f1:
    f1.writelines(file)         # template rows first
    f1.writelines(lines)        # kinetically characterized entries
    f1.writelines(lines_synth)  # synthesized entries

In [91]:
# Display the rows of the merged cluster table that matched a kinetic_ID.
uclust_data[~pd.isna(uclust_data['kinetic_ID'])]
#uclust_data.loc[uclust_data['Query'].str.contains('AAA26115.1'),'Query'].loc[47373]
#flamholz_data.loc[0,'kinetic_ID']

Unnamed: 0,Type,Cluster,Size,%Id,Strand,Qlo,Tlo,Alignment,Query,Target,ID,type,comments,kinetic_ID
158,H,26,477,84.2,.,0,0,I475M2D,"AFV62913.1 ribulose-1,5-bisphosphate carboxyla...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"AFV62913.1 ribulose-1,5-bisphosphate carboxyla..."
290,H,26,476,82.8,.,0,0,I468MD7M,"NP_043033.1 ribulose-1,5-bisphosphate carboxyl...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"NP_043033.1 ribulose-1,5-bisphosphate carboxyl..."
551,H,26,482,83.4,.,0,0,6D476M,"ARJ58803.1 ribulose-1,5-bisphosphate carboxyla...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"ARJ58803.1 ribulose-1,5-bisphosphate carboxyla..."
940,H,26,477,82.9,.,0,0,I475M2D,"YP_009267237.1 ribulose-1,5-bisphosphate carbo...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"YP_009267237.1 ribulose-1,5-bisphosphate carbo..."
1306,H,26,472,75.6,.,0,0,9I467M5D,"AAA23328.1 ribulose-1,5-bisphosphate carboxyla...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"AAA23328.1 ribulose-1,5-bisphosphate carboxyla..."
1694,H,26,485,84.8,.,0,0,I475M10D,"ADW80658.1 ribulose-1,5-bisphosphate carboxyla...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"ADW80658.1 ribulose-1,5-bisphosphate carboxyla..."
1993,H,26,475,84.2,.,0,0,I475M,"ADK47564.1 ribulose-1,5-bisphosphate carboxyla...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"ADK47564.1 ribulose-1,5-bisphosphate carboxyla..."
3349,H,26,480,84.4,.,0,0,I475M5D,"AAK14845.1 ribulose-1,5-bisphosphate carboxyla...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"AAK14845.1 ribulose-1,5-bisphosphate carboxyla..."
6928,H,99,475,75.6,.,0,0,134M60D35M12D234M287I,"SCM15150.1 Large subunit of Ribulose-1,5-bisph...",gi|223541024|gb|EEF42581.1| conserved hypothet...,,,,"SCM15150.1 Large subunit of Ribulose-1,5-bisph..."
7139,H,26,477,83.6,.,0,0,I475M2D,"ABK79588.1 ribulose-1,5-bisphosphate carboxyla...",gi|518331021|ref|WP_019501228.1| ribulose-bisp...,,,,"ABK79588.1 ribulose-1,5-bisphosphate carboxyla..."


In [23]:
# Select the aligned sequences whose ids are listed in ../output/type1_fasttree.txt and
# save them as a separate FASTA alignment.
# The id file is read inside a with-block so its handle is closed after reading.
with open('../output/type1_fasttree.txt','r') as f:
    # One id per line; spaces are replaced by underscores before comparing with record ids.
    type1_id = [x.rstrip('\n').replace(' ','_') for x in f.readlines()]
type1_lookup = set(type1_id)  # constant-time membership; type1_id itself stays a list

aln = AlignIO.read('../output/03_uclust_all_0.7_trimmed.aln',format='fasta')
type1 = [record for record in aln if record.id in type1_lookup]

with open(r"../output/05_type1_fasttree.aln", "w") as output_handle:
    SeqIO.write(type1, output_handle, "fasta")


In [54]:
# Select the aligned sequences whose ids are listed in ../output/type2_fasttree.txt and
# save them as a separate FASTA alignment.
# The id file is read inside a with-block so its handle is closed after reading.
with open('../output/type2_fasttree.txt','r') as f:
    # One id per line; spaces are replaced by underscores before comparing with record ids.
    type2_id = [x.rstrip('\n').replace(' ','_') for x in f.readlines()]
type2_lookup = set(type2_id)  # constant-time membership; type2_id itself stays a list

aln = AlignIO.read('../output/03_uclust_all_0.7_trimmed.aln',format='fasta')
type2 = [record for record in aln if record.id in type2_lookup]

with open(r"../output/05_type2_fasttree.aln", "w") as output_handle:
    SeqIO.write(type2, output_handle, "fasta")

In [57]:
# Select the aligned sequences whose ids are listed in ../output/type2_3_fastree.txt and
# save them as a separate FASTA alignment.
# The id file is read inside a with-block so its handle is closed after reading.
with open('../output/type2_3_fastree.txt','r') as f:
    # One id per line; spaces are replaced by underscores before comparing with record ids.
    type2_3_id = [x.rstrip('\n').replace(' ','_') for x in f.readlines()]
type2_3_lookup = set(type2_3_id)  # constant-time membership; type2_3_id itself stays a list

aln = AlignIO.read('../output/03_uclust_all_0.7_trimmed.aln',format='fasta')
type2_3 = [record for record in aln if record.id in type2_3_lookup]

with open(r"../output/05_type2_3_fasttree.aln", "w") as output_handle:
    SeqIO.write(type2_3, output_handle, "fasta")

In [61]:
# Split the trimmed alignment by exclusion: `not_type4` holds every record whose id is not
# in the type 4 list, and `type3` holds every record whose id is in none of the four id
# lists (type 1, type 2, type 2/3, type 4).
# Each id file holds one id per line; spaces are replaced by underscores before matching.
# The files are read inside with-blocks so their handles are closed after reading.
with open('../output/type2_fasttree.txt','r') as f2:
    type2_id = [x.rstrip('\n').replace(' ','_') for x in f2.readlines()]

with open('../output/type1_fasttree.txt','r') as f1:
    type1_id = [x.rstrip('\n').replace(' ','_') for x in f1.readlines()]

with open('../output/type2_3_fastree.txt','r') as f:
    type2_3_id = [x.rstrip('\n').replace(' ','_') for x in f.readlines()]

with open('../output/type4_fasttree.txt','r') as f4:
    type4_id = [x.rstrip('\n').replace(' ','_') for x in f4.readlines()]

# Build the lookup sets once instead of concatenating the four lists for every record.
type4_lookup = set(type4_id)
assigned_lookup = set(type1_id+type2_id+type2_3_id+type4_id)

aln = AlignIO.read('../output/03_uclust_all_0.7_trimmed.aln',format='fasta')
type3 = []
not_type4 = []
for record in aln:
    if record.id not in type4_lookup:
        not_type4.append(record)
    if record.id not in assigned_lookup:
        type3.append(record)

In [77]:
# Number of alignment records collected in the type1 list.
len(type1)


28

In [53]:
# Sum the values returned by shannon_entropy_list_msa for the type 1 alignment.
# NOTE(review): presumably one Shannon entropy value per alignment column - confirm
# against the local Shannon module.
# NOTE(review): shannon_entropy is imported here but not used in this cell.
from Shannon import shannon_entropy_list_msa,shannon_entropy
aln1 = AlignIO.read('../output/05_type1_fasttree.aln',format='fasta')
sum(shannon_entropy_list_msa(aln1))


781.5564408080079