In [1]:
import pandas as pd
from Bio import SeqIO, AlignIO, Seq
import numpy as np
from collections import Counter
from helper import *

### Preprocessing

The first thing we need to do is to remove occurrences of more than one `_` from the sequence IDs, because they cause problems once the tree is moved to iTOL. We do this for the file `RuBisCO.300-700.faa` and save the result with the suffix `processed`.

#### Creating a cleaned FASTA file from the data in Jaffe et al. by removing the ":" and everything after it from the sequence IDs

In [2]:
sequences = []
for record in SeqIO.parse('../data/jaffe_et_al_2018_rubisco_superfamilies.faa', "fasta"):
    record.description = record.description.split(':')[0]
    record.id = record.id.split(':')[0]
    record.name = record.name.split(':')[0]
    sequences.append(record)
!mkdir -p ../output/00_100p_tree
with open(r"../output/00_100p_tree/jaffe_et_al_2018_rubisco_superfamilies_clean.faa", "w") as output_handle:
    SeqIO.write(sequences, output_handle, "fasta")

#### Removing the trailing `*` from each sequence in the FASTA file RuBisCO.300-700

In [3]:
# Load the processed RuBisCO records, strip any trailing '*' characters from
# each sequence, and save the result as the cleaned FASTA file.
sequences = list(SeqIO.parse('../data/RuBisCO.300-700_20190429_processed.faa', "fasta"))
for rec in sequences:
    rec.seq = rec.seq.rstrip(chars='*')

with open(r"../output/00_100p_tree/RuBisCO.300-700_clean.faa", "w") as clean_fasta:
    SeqIO.write(sequences, clean_fasta, "fasta")

### Merge fasta files and cluster sequences

In [4]:
# Concatenate the input FASTA files into a single merged file in the output
# directory. The active command uses the cleaned RuBisCO.300-700 and Jaffe et
# al. files written by the cells above.
# The commented-out line below is an earlier variant of the command: it read
# the uncleaned RuBisCO.300-700.faa, included erb_et_al_2012 instead of the
# Jaffe et al. file, and wrote to ../data/merged_data.faa.
#!cat ../data/milo_synthetized_rubisco.faa ../data/RuBisCO.300-700.faa ../data/flamholz_et_al_2019_kinetically_characterized.faa ../data/erb_et_al_2012_rubisco_superfamilies.faa > ../data/merged_data.faa
!cat ../data/milo_synthetized_rubisco.faa ../output/00_100p_tree/RuBisCO.300-700_clean.faa ../data/flamholz_et_al_2019_kinetically_characterized.faa ../output/00_100p_tree/jaffe_et_al_2018_rubisco_superfamilies_clean.faa > ../output/00_100p_tree/merged_data.faa

### Remove sequences with ambiguous amino acid calls

In [5]:
# The 20 canonical amino acid letters (not referenced by the filter below).
aa = ['A','R','N','D','C','G','Q','E','H','I','L','K','M','F','P','S','T','W','Y','V']
# Letters treated as ambiguous calls; a record containing any of them is dropped.
ambiguous_aa = ['B','Z','X','J']

# Keep only the records whose sequence contains none of the ambiguous letters.
canon = [
    rec
    for rec in SeqIO.parse('../output/00_100p_tree/merged_data.faa', "fasta")
    if not any(residue in ambiguous_aa for residue in rec.seq)
]

with open(r"../output/00_100p_tree/merged_data_clean.faa", "w") as clean_fasta:
    SeqIO.write(canon, clean_fasta, "fasta")

### Cluster sequences at 100% identity

In [6]:
# Run usearch `-cluster_fast` on the ambiguity-filtered merged FASTA with an
# identity threshold of 1 (`-id 1`) and write the cluster table to
# uclust_all_1.uc, which the next cell reads.
!../bin/usearch11.0.667_i86linux32 -cluster_fast ../output/00_100p_tree/merged_data_clean.faa -id 1 -uc ../output/00_100p_tree/uclust_all_1.uc

usearch v11.0.667_i86linux32, 4.0Gb RAM (16.3Gb total), 8 cores
(C) Copyright 2013-18 Robert C. Edgar, all rights reserved.
https://drive5.com/usearch

License: yinonmoise.baron@weizmann.ac.il

00:00 78Mb    100.0% Reading ../output/00_100p_tree/merged_data_clean.faa
00:00 118Mb   100.0% DF
00:00 120Mb  57626 seqs, 56166 uniques, 55141 singletons (98.2%)
00:00 120Mb  Min size 1, median 1, max 48, avg 1.03
00:00 127Mb   100.0% DB
02:35 316Mb   100.0% 40945 clusters, max size 72, avg 1.4
                                                         
      Seqs  56166 (56.2k)
  Clusters  40945 (40.9k)
  Max size  72
  Avg size  1.4
  Min size  1
Singletons  32331 (32.3k), 57.6% of seqs, 79.0% of clusters
   Max mem  316Mb
      Time  02:35
Throughput  362.4 seqs/sec.



### Refine cluster centroids to include synthesized sequences

In [7]:
# Read the usearch cluster table, make synthetized sequences the centroid of
# their cluster, and write (a) a FASTA file with one record per centroid and
# (b) the adjusted cluster table as CSV.

# Column names for the tab-separated .uc file (the file itself has no header row).
header = ['Type','Cluster','Size','%Id','Strand','Qlo','Tlo','Alignment','Query','Target']
uclust = pd.read_csv('../output/00_100p_tree/uclust_all_1.uc', sep='\t', names=header, index_col=False)

# Drop the 'S' rows (presumably redundant with the 'C' rows - TODO confirm
# against the usearch .uc format). `.copy()` makes the filtered table an
# independent frame, so the `.loc` assignments below do not write into a slice
# of the frame returned by read_csv (avoids SettingWithCopyWarning).
uclust = uclust[uclust['Type'] != 'S'].copy()

# Rows whose Target is '*' get their own Query label as Target.
is_self_target = uclust['Target'] == '*'
uclust.loc[is_self_target, 'Target'] = uclust.loc[is_self_target, 'Query']

# Find synthetized sequences that are not centroids and replace them as centroids
# (synthetized sequences are recognised by labels starting with 'RBC').
synth_clusters = uclust[uclust['Query'].str.startswith('RBC') & ~uclust['Target'].str.startswith('RBC')]
for _, synth_row in synth_clusters.iterrows():
    # Relabel every row that points at the old centroid. If the cluster was
    # already relabelled by an earlier synthetized sequence, nothing matches
    # and the first relabelling is kept.
    uclust.loc[uclust['Target'] == synth_row['Target'], 'Target'] = synth_row['Query']

centroids = uclust[uclust['Type'] == 'C']
# Only the first whitespace-delimited token of the label is compared with
# `record.id` below.
c_list = [c.split(" ")[0] for c in centroids.Target.values]

# Set lookup instead of scanning the centroid list for every record in the file.
centroid_ids = set(c_list)
sequences = [
    record
    for record in SeqIO.parse('../output/00_100p_tree/merged_data_clean.faa', "fasta")
    if record.id in centroid_ids
]

# Keep only the first record seen for each id. `seq2` holds the ids and `seq3`
# the records, both in first-seen order; `seen_ids` makes the duplicate check
# constant-time.
seq2 = []
seq3 = []
seen_ids = set()
for record in sequences:
    if record.id not in seen_ids:
        seen_ids.add(record.id)
        seq2.append(record.id)
        seq3.append(record)

with open('../output/00_100p_tree/uclust_all_1.faa', "w") as output_handle:
    SeqIO.write(seq3, output_handle, "fasta")

uclust.to_csv('../output/00_100p_tree/uclust_all_1.csv',index=False)

### Adapt type legend to the new centroids

This will also drop some of the sequences in jaffe_2018, because sequences containing ambiguous amino acids were removed before clustering.

In [8]:
# Attach the Jaffe et al. type annotation to every cluster member whose Query
# label matches an annotated ID, then report it under the cluster's centroid
# label (the Target column), renamed to 'ID'.
type_labels = pd.read_csv('../data/jaffe_et_al_2018_rubisco_types_processed.csv')
labelled_members = uclust.merge(type_labels, left_on='Query', right_on='ID')
cluster_labels = (
    labelled_members[['Target', 'Organism', 'type']]
    .rename(columns={'Target': 'ID'})
)
cluster_labels.to_csv('../output/00_100p_tree/uclust_all_1_rubisco_types.csv', index=False)

In [20]:
# For every species with a measured vC in the kinetic data, look up the
# cluster centroid label(s) of matching sequences and save one label per
# species.

# remove type IV rubisco of rubrum to avoid mischaracterization
# NOTE(review): this rebinds `uclust`, so any cell executed after this one
# sees the filtered table.
uclust = uclust[~uclust.Query.str.contains('499709017')]

kin_data = pd.read_excel('../data/flamholz_et_al_kinetic_data.xlsx', sheet_name='kinetic_data')
kin_data = kin_data[~kin_data.vC.isna()]

# Species name = first two space-separated words of the first (unnamed) column.
species = kin_data['Unnamed: 0'].apply(lambda x: " ".join(x.split(' ')[:2]))
species = species.unique()

kin_list = []
for species_name in species:
    # Match the species name as a literal substring (regex=False), so that
    # characters such as '.' or '(' in a name are not interpreted as regular
    # expression syntax. The lookup is done once per species.
    is_species = uclust.Query.str.contains(species_name, regex=False)
    targets = uclust.loc[is_species, 'Target'].unique()
    if len(targets) > 0:
        # When several centroids match, keep the one with the longest label
        # (the first such label on ties); with a single match this is that match.
        kin_list.append(max(targets, key=len))

pd.DataFrame(kin_list,columns=['kinetic_ID']).to_csv('../output/00_100p_tree/uclust_all_1_kinetic_data.csv',index=False)

  import sys


In [10]:
# Collect the header descriptions of the synthetized sequences, match them
# against the Query labels of the cluster table, and save the corresponding
# centroid (Target) labels.
synth_descriptions = [
    rec.description
    for rec in SeqIO.parse('../data/milo_synthetized_rubisco.faa', format='fasta')
]
synth_data = pd.DataFrame(synth_descriptions)  # single column, named 0
synth_targets = uclust.merge(synth_data, left_on='Query', right_on=0)['Target']
synth_targets.to_csv('../output/00_100p_tree/synth_data.csv', index=False)

  
