In [2]:
import pandas as pd
from Bio import SeqIO, AlignIO, Seq
from pandas import np
from collections import Counter
from helper import *

### Preprocessing

The first thing we need to do is to remove places with more than one `_` from the ids because after moving to iTOL this will be a problem. We do this for the file  `RuBisCO.300-700.faa` and save it with the suffix 'processed'

#### Creating csv file from the data in Jaffe et al. and cleaning sequence ID from ":"

In [2]:
sequences = []
for record in SeqIO.parse('../data/jaffe_et_al_2018_rubisco_superfamilies.faa', "fasta"):
    record.description = record.description.split(':')[0]
    record.id = record.id.split(':')[0]
    record.name = record.name.split(':')[0]
    sequences.append(record)
!mkdir -p ../output/00_100p_tree
with open(r"../output/00_100p_tree/jaffe_et_al_2018_rubisco_superfamilies_clean.faa", "w") as output_handle:
    SeqIO.write(sequences, output_handle, "fasta")

#### Removing * from the end of the fasta file RuBisCO.300-700

In [3]:
sequences = []
for record in SeqIO.parse('../data/RuBisCO.300-700_processed.faa', "fasta"):
    record.seq = record.seq.rstrip(chars='*')
    sequences.append(record)
        
with open(r"../output/00_100p_tree/RuBisCO.300-700_clean.faa", "w") as output_handle:
    SeqIO.write(sequences, output_handle, "fasta")

### Merge fasta files and cluster sequences

In [4]:
#!cat ../data/milo_synthetized_rubisco.faa ../data/RuBisCO.300-700.faa ../data/flamholz_et_al_2019_kinetically_characterized.faa ../data/erb_et_al_2012_rubisco_superfamilies.faa > ../data/merged_data.faa
!cat ../data/milo_synthetized_rubisco.faa ../output/00_100p_tree/RuBisCO.300-700_clean.faa ../data/flamholz_et_al_2019_kinetically_characterized.faa ../output/00_100p_tree/jaffe_et_al_2018_rubisco_superfamilies_clean.faa > ../output/00_100p_tree/merged_data.faa

### remove sequences with ambiguous calls 

In [5]:
aa = ['A','R','N','D','C','G','Q','E','H','I','L','K','M','F','P','S','T','W','Y','V']
ambiguous_aa = ['B','Z','X','J']
canon = []
for record in SeqIO.parse('../output/00_100p_tree/merged_data.faa', "fasta"):
    no_of_aa = np.array([x in ambiguous_aa for x in record.seq]).sum()
    if no_of_aa == 0:
        canon.append(record)

with open(r"../output/00_100p_tree/merged_data_clean.faa", "w") as output_handle:
    SeqIO.write(canon, output_handle, "fasta")

### Cluster sequences based on 100 ID

In [7]:
!../bin/usearch11.0.667_i86linux32 -cluster_fast ../output/00_100p_tree/merged_data_clean.faa -id 1 -uc ../output/00_100p_tree/uclust_all_1.uc

usearch v11.0.667_i86linux32, 4.0Gb RAM (16.1Gb total), 4 cores
(C) Copyright 2013-18 Robert C. Edgar, all rights reserved.
https://drive5.com/usearch

License: yinonmoise.baron@weizmann.ac.il

00:00 73Mb    100.0% Reading ../output/00_100p_tree/merged_data_clean.faa
00:00 71Mb    100.0% DF
00:00 72Mb   49110 seqs, 48052 uniques, 47400 singletons (98.6%)
00:00 72Mb   Min size 1, median 1, max 48, avg 1.02
00:01 82Mb    100.0% DB
02:35 251Mb   100.0% 35413 clusters, max size 53, avg 1.4
                                                         
      Seqs  48052 (48.1k)
  Clusters  35413 (35.4k)
  Max size  53
  Avg size  1.4
  Min size  1
Singletons  28293 (28.3k), 58.9% of seqs, 79.9% of clusters
   Max mem  251Mb
      Time  02:34
Throughput  312.0 seqs/sec.



### Refine cluster centroids to include synthesized sequences

In [8]:
header = ['Type','Cluster','Size','%Id','Strand','Qlo','Tlo','Alignment','Query','Target']
uclust = pd.read_csv('../output/00_100p_tree/uclust_all_1.uc', sep='\t', names=header, index_col=False)
uclust = uclust[uclust['Type'] !='S']
uclust.loc[uclust['Target'] =='*','Target'] = uclust.loc[uclust['Target'] =='*','Query']

# Find synthetized sequences that are not centroids and replace them as centroids
synth_clusters = uclust[uclust['Query'].str.startswith('RBC') & ~uclust['Target'].str.startswith('RBC')]
for i in synth_clusters.iterrows():
    uclust.loc[uclust.Target== i[1].Target,'Target'] = i[1].Query

centroids = uclust[uclust['Type']=='C']
c_list = centroids.Target.values
c_list = [c.split(" ")[0] for c in c_list]

sequences = []
for record in SeqIO.parse('../output/00_100p_tree/merged_data_clean.faa', "fasta"):
    if record.id in c_list:
        sequences.append(record)

seq2 = []
seq3 = []
for i,record in enumerate(sequences):
    if not record.id in seq2: 
        seq2.append(record.id)
        seq3.append(record)
with open('../output/00_100p_tree/uclust_all_1.faa', "w") as output_handle:
    SeqIO.write(seq3, output_handle, "fasta")

uclust.to_csv('../output/00_100p_tree/uclust_all_1.csv',index=False)

### Adapt type legend to the new centroids

This will also throw some of the sequences in jaffe_2018 because of ambiguous amino acids

In [11]:
type_labels = pd.read_csv('../data/jaffe_et_al_2018_rubisco_types_processed.csv')
cluster_labels = uclust.merge(type_labels,left_on='Query',right_on='ID')[['Target','Organism','type']]
cluster_labels.columns = ['ID','Organism','type']
cluster_labels.to_csv('../output/00_100p_tree/uclust_all_1_rubisco_types.csv',index=False)

In [17]:
uclust = pd.read_csv('../output/00_100p_tree/uclust_all_1.csv')
# remove type IV rubisco of rubrum to avoid mischaracterization
uclust = uclust[~uclust.Query.str.contains('499709017')]

kin_data = pd.read_excel('../data/flamholz_et_al_kinetic_data_modified_20190611.xlsx','kinetic_data')
kin_data = kin_data[~kin_data.vC.isna()]
species = kin_data['Unnamed: 0'].apply(lambda x: " ".join(x.split(' ')[:2]))
species = species.unique()
species = pd.np.append(species,['Methanococcus jannaschii','Methanosarcina acetivorans','Eucalyptus'])
kin_list = []
for i in species:
    # Specific species which have both type I/IIIb and type IV
    if i == 'Rhodopseudomonas palustris':
        kin_list.append(uclust[uclust.Query.str.contains('90104852')].Target.values[0])
        continue
    if i == 'Allochromatium vinosum':
        kin_list.append(uclust[uclust.Query.str.contains('502735590')].Target.values[0])
        continue
    if i == 'Archaeoglobus fulgidus':
        kin_list.append(uclust[uclust.Query.str.contains('973026383')].Target.values[0])
        continue
    if i == 'Thiobacillus denitrificans':
        kin_list.append('gi|499632416|ref|WP_011313150.1|')
        continue
    if i == 'Acidithiobacillus ferrooxidans':
        #kin_list.append(uclust[uclust.Query.str.contains('501530536')].Target.values[0])
        #add type I AfM
        kin_list.append(uclust[uclust.Query.str.contains('226736656')].Target.values[0])
        #kin_list.append(uclust[uclust.Query.str.contains('1427065124')].Target.values[0])
        continue        
        
    if len(uclust[uclust.Query.str.contains(i)].Target.unique()) != 0:
        if len(uclust[uclust.Query.str.contains(i)].Target.unique()) > 1:
            tmp = uclust[uclust.Query.str.contains(i)].Target.unique()
            kin_list.append(tmp[np.argmax([len(x) for x in tmp])])
        else:
            kin_list.append(uclust[uclust.Query.str.contains(i)].Target.unique()[0])

pd.DataFrame(kin_list,columns=['kinetic_ID']).to_csv('../output/00_100p_tree/uclust_all_1_kinetic_data.csv',index=False)



In [16]:
#pd.DataFrame([[x,len(x.split('Thiobacillus'))] for x in kin_list]).sort_values(by=1,ascending=False)
i = 'Thiobacillus denitrificans'
uclust[uclust.Query.str.contains(i)].Target.unique()

#uclust[uclust.Query.str.contains('Thiobacillus denitrificans')]

array(['Thiobacillus_denitrificans_AAA9917_REF',
       'gi|516743924|ref|WP_018078338.1| ribulose-bisphosphate carboxylase [Thiobacillus denitrificans]',
       'gi|516743937|ref|WP_018078351.1| ribulose-bisphosphate carboxylase large subunit [Thiobacillus denitrificans]',
       'gi|981552503|ref|WP_059755372.1| ribulose 1,5-bisphosphate carboxylase [Thiobacillus denitrificans] >gi|979417419|gb|KVW95846.1| ribulose 1,5-bisphosphate carboxylase [Thiobacillus denitrificans]',
       'gi|499632402|ref|WP_011313136.1| ribulose bisphosphate carboxylase large chain [Thiobacillus denitrificans] >gi|3183147|sp|Q56259.2|RBL1_THIDA RecName: Full=Ribulose bisphosphate carboxylase large chain; Short=RuBisCO large subunit >gi|2411435|gb|AAB70697.1| ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit [Thiobacillus denitrificans] >gi|74058137|gb|AAZ98577.1| ribulose-bisphosphate carboxylase form I large chain [Thiobacillus denitrificans ATCC 25259]'],
      dtype=object)

In [94]:
kin_data = pd.read_excel('../data/flamholz_et_al_kinetic_data_modified_20190611.xlsx','kinetic_data')
kin_data = kin_data[~kin_data.vC.isna()]
kin_data = kin_data[kin_data['use'] == 1]
species = kin_data['Unnamed: 0'].apply(lambda x: " ".join(x.split(' ')[:2]))
species = species.unique()
species = pd.np.append(species,['Methanococcus jannaschii','Methanosarcina acetivorans'])
#species
len(species)


224

In [95]:
uclust = pd.read_csv('../output/00_100p_tree/uclust_all_1.csv')
uclust = uclust[~uclust.Query.str.contains('499709017')]

kin_list = []
found_species = []
for i in species:
    # Specific species which have both type I/IIIb and type IV
    if i == 'Rhodopseudomonas palustris':
        kin_list.append(uclust[uclust.Query.str.contains('90104852')].Target.values[0])
        found_species.append(i)
        continue
    if i == 'Allochromatium vinosum':
        kin_list.append(uclust[uclust.Query.str.contains('502735590')].Target.values[0])
        found_species.append(i)
        continue
    if i == 'Archaeoglobus fulgidus':
        kin_list.append(uclust[uclust.Query.str.contains('973026383')].Target.values[0])
        found_species.append(i)
        continue
    #if i == 'Acidithiobacillus ferrooxidans':
        #kin_list.append(uclust[uclust.Query.str.contains('501530536')].Target.values[0])
        #kin_list.append(uclust[uclust.Query.str.contains('1427065124')].Target.values[0])
    #    continue        
        
    if len(uclust[uclust.Query.str.contains(i)].Target.unique()) != 0:
        if len(uclust[uclust.Query.str.contains(i)].Target.unique()) > 1:
            tmp = uclust[uclust.Query.str.contains(i)].Target.unique()
            kin_list.append(tmp[np.argmax([len(x) for x in tmp])])
            found_species.append(i)
        else:
            kin_list.append(uclust[uclust.Query.str.contains(i)].Target.unique()[0])
            found_species.append(i)
            




In [100]:
#uclust[uclust.Query.str.contains(species[1])].Target
#species_df[~species_df[0].isin(found_species)].to_csv('../output/00_100p_tree/missing_species.csv')
species_df = pd.DataFrame(species)
#species_df[~species_df[0].isin(found_species)]
len(kin_list)

196

In [88]:
new = pd.read_excel('../data/DatasetS2_RubiscoKinetics_Merged_yinon.xlsx','Sheet2')
new_species = new['Unnamed: 3'].drop_duplicates()
species_df = pd.DataFrame(species)
species_df[~species_df[0].isin(new_species)]
#new_species[~new_species[0].isin(species_df)]


Unnamed: 0,0
1,Synechococcus sp.
11,Limonium antonii-llorensii
12,Limonium leonardi-llorensii
32,Agave victoriae-reginae
48,Thermosynechococcus elongatus
61,Coffea arabica
62,Lactuca sativa
74,Archaeoglobus fulgidus
79,Chenopodium album
82,Flaveria chlorifolia


In [86]:
from Bio import Entrez
Entrez.email = "yinonmoise.baron@weizmann.ac.il"
species_standard = []
for i in species:
    handle = Entrez.esearch(db="Taxonomy", retmax=10, term=i)
    record = Entrez.read(handle)
    if len(record['IdList']) >0:
        handle = Entrez.esummary(db="Taxonomy", id=record['IdList'][0])
        record = Entrez.read(handle)
        species_standard.append(' '.join([record[0]['Genus'],record[0]['Species']]))


HTTPError: HTTP Error 429: Too Many Requests

In [87]:
len(species_standard)

90

In [14]:
synth_data = pd.DataFrame([x.description for x in SeqIO.parse('../data/milo_synthetized_rubisco.faa',format='fasta')])
uclust.merge(synth_data,left_on='Query',right_on=0).Target.to_csv('../output/00_100p_tree/synth_data.csv',index=False)

  


In [8]:
uclust = pd.read_csv('../output/00_100p_tree/uclust_all_1.csv')
synth_data = pd.DataFrame([x.description for x in SeqIO.parse('../data/milo_synthetized_rubisco.faa',format='fasta')])
t = uclust.merge(synth_data,left_on='Query',right_on=0)
t['Internal ID'] = t[0].apply(lambda x: x.split(' ')[0])
t[[0,'Target','Internal ID']].to_csv('../output/00_100p_tree/milo_syn_100p.csv')