In [1]:
import os
from collections import Counter
from getpass import getpass

import numpy as np
import pandas as pd
from Bio import SeqIO, AlignIO, Seq
import python_cipres.client as CipresClient

from helper import *

### Extract fasta sequences for the autotrophic rubiscos
Before running this code we need to generate a file called `autotrophic_rubisco_70p.csv` by selecting in ITOL the clades that belong to Rubisco types 1, 2, 2/3, 3a, 3c, 3-like, 3b and IV. We replaced spaces in the sequence IDs with `_` to match the sequence IDs in the fasta files. Only sequences that are not type IV or type III-b are treated as autotrophic.

In [19]:
uclust_data = pd.read_csv('../output/01_70p_tree/uclust_all_0.7.csv')
uclust_data = uclust_data[uclust_data['Type'] !='S']
uclust_data.loc[uclust_data['Target'] == '*','Target'] = uclust_data.loc[uclust_data['Target'] == '*','Query']
uclust_data['cut Target'] = uclust_data.Target.apply(lambda x: x.split(' ')[0])
#auto_id = [x.replace('\n','') for x in open('../output/01_70p_tree/autotrophic_rubisco_70p.txt').readlines() if x != '\n']
rubisco_types = pd.read_csv('../output/01_70p_tree/rubisco_types_70p.csv')
autotrophic_70p = rubisco_types[~rubisco_types['type'].isin(['IV','IIIb'])]
auto_rubisco = uclust_data.merge(autotrophic_70p, left_on='cut Target', right_on='ID')
autotrophic_rubisco  = auto_rubisco['Query'].values

auto_seq = []
for record in SeqIO.parse('../output/01_70p_tree/merged_data_clean.faa', "fasta"):
    if record.description in autotrophic_rubisco:
        auto_seq.append(record)
!mkdir -p ../output/02_90p_autotrophic_rubisco_tree
with open(r"../output/02_90p_autotrophic_rubisco_tree/autotrophic_rubisco_seq.faa", "w") as output_handle:
    SeqIO.write(auto_seq, output_handle, "fasta")

In [20]:
# Persist the type-annotated 70% cluster table, then show how many distinct
# member sequences (Query) fall under each Rubisco type.
auto_rubisco.to_csv(
    '../output/02_90p_autotrophic_rubisco_tree/uclust_all_0.7_with_type.csv'
)
queries_by_type = auto_rubisco.groupby('type')['Query']
queries_by_type.nunique()

type
I              45050
II               595
II/III           140
III-like         176
IIIa              65
IIIc              62
IV-outgroup        1
Name: Query, dtype: int64

### Cluster sequences using uclust

In [21]:
# Cluster the autotrophic sequences with usearch `-cluster_fast` at an identity threshold of 0.9 (`-id 0.9`);
# the cluster assignments are written to the file given by `-uc`.
!../bin/usearch11.0.667_i86linux32 -cluster_fast ../output/02_90p_autotrophic_rubisco_tree/autotrophic_rubisco_seq.faa -id 0.9 -uc ../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.uc

usearch v11.0.667_i86linux32, 4.0Gb RAM (16.3Gb total), 8 cores
(C) Copyright 2013-18 Robert C. Edgar, all rights reserved.
https://drive5.com/usearch

License: yinonmoise.baron@weizmann.ac.il

00:00 71Mb    100.0% Reading ../output/02_90p_autotrophic_rubisco_tree/autotrophic_rubisco_seq.faa
00:00 110Mb   100.0% DF
00:00 111Mb  46287 seqs, 45447 uniques, 44939 singletons (98.9%)
00:00 111Mb  Min size 1, median 1, max 48, avg 1.02
00:00 118Mb   100.0% DB
00:02 175Mb   100.0% 776 clusters, max size 8526, avg 59.6
                                                          
      Seqs  45447 (45.4k)
  Clusters  776
  Max size  8526
  Avg size  59.6
  Min size  1
Singletons  244, 0.5% of seqs, 31.4% of clusters
   Max mem  175Mb
      Time  2.00s
Throughput  22.7k seqs/sec.



### Take only cluster fasta files and create csv file

In [22]:
# Convert the usearch .uc output into a CSV table (`outfile`) and a FASTA file (`outfasta`).
# NOTE(review): `parse_uclust` is presumably provided by `from helper import *`; judging by the
# section heading, `outfasta` should hold one sequence per cluster — confirm in helper.py.
parse_uclust(infile='../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.uc',
             fasta='../output/02_90p_autotrophic_rubisco_tree/autotrophic_rubisco_seq.faa',
             outfasta='../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.faa',
             outfile='../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.csv'
            )

In [23]:
# Load the 90%-identity cluster table and drop the rows of Type 'S'.
# `.copy()` detaches the filtered frame so the `.loc` assignment below writes to it
# directly rather than to a slice of the original (avoids SettingWithCopyWarning).
auto_rub_90p = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.csv')
auto_rub_90p = auto_rub_90p[auto_rub_90p['Type'] != 'S'].copy()

# Rows whose Target is the placeholder '*' get their own Query as Target.
has_no_target_90p = auto_rub_90p['Target'] == '*'
auto_rub_90p.loc[has_no_target_90p, 'Target'] = auto_rub_90p.loc[has_no_target_90p, 'Query']

# Join the 70% table (with Rubisco type) to the 90% table on the shared 'Query' column.
# Columns present in both frames get pandas' default suffixes: `_x` comes from
# `auto_rubisco` (70% clustering) and `_y` from `auto_rub_90p` (90% clustering).
auto_rub90 = auto_rubisco.merge(auto_rub_90p, on='Query')
auto_rub90.to_csv('../output/02_90p_autotrophic_rubisco_tree/uclust_all_0.9_with_type.csv')

# Number of distinct 90% cluster targets per Rubisco type.
auto_rub90.groupby('type')['Target_y'].nunique()

type
I              469
II             120
II/III          48
III-like        67
IIIa            39
IIIc            32
IV-outgroup      1
Name: Target_y, dtype: int64

## Create multiple sequence alignment

In [24]:
# Align the 90%-cluster FASTA with MAFFT (no extra options passed) and redirect the alignment to the .aln file.
!../bin/mafft-linux64/mafft.bat ../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.faa > ../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.aln

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
  701 / 776
done.

Constructing a UPGMA tree (efffree=0) ... 
  770 / 776
done.

Progressive alignment 1/2... 
STEP   501 / 775 
Reallocating..done. *alloclen = 2430
STEP   701 / 775 
Reallocating..done. *alloclen = 3887

done.

Making a distance matrix from msa.. 
  700 / 776
done.

Constructing a UPGMA tree (efffree=1) ... 
  770 / 776
done.

Progressive alignment 2/2... 
STEP   701 / 775 
Reallocating..done. *alloclen = 2560

Reallocating..done. *alloclen = 3616

done.

disttbfast (aa) Version 7.427
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring sche

## Clean MSA to contain only positions with more than 5% coverage (based on Jaffe et al. 2018)

In [25]:
# Trim the MAFFT alignment and write the result to the `_trimmed.aln` file.
# NOTE(review): `clean_aln` is presumably provided by `from helper import *`; the 5% coverage
# threshold named in the section heading is not passed here, so it must live in the helper — confirm.
clean_aln(infile='../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.aln',
          outfile='../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9_trimmed.aln')

In [23]:
# Build the CIPRES REST client.
# Credentials are never stored in the notebook: each value is read from an environment
# variable and, if that is unset or empty, requested interactively. `getpass` does not
# echo the typed value, so secrets do not end up in the saved cell output.
cipres_app_id = os.environ.get('CIPRES_APP_ID') or getpass('CIPRES application ID: ')
cipres_username = os.environ.get('CIPRES_USERNAME') or input('CIPRES username: ')
cipres_password = os.environ.get('CIPRES_PASSWORD') or getpass('CIPRES password: ')

cip = CipresClient.Client(appname='RO',
                          appID=cipres_app_id,
                          baseUrl='https://cipresrest.sdsc.edu/cipresrest/v1',
                          username=cipres_username,
                          password=cipres_password)


In [27]:
# Submit the trimmed alignment to CIPRES as a job for the 'RAXMLHPC8_REST_XSEDE' tool.
# NOTE(review): the meaning of the individual tool parameters (e.g. 'fa', 'x', '160')
# is defined by the CIPRES tool specification — confirm there before changing them.
raxml_tool_params = {
    'toolId': 'RAXMLHPC8_REST_XSEDE',
    'datatype_': 'protein',
    'runtime_': '160',
    'select_analysis_': 'fa',
    'choose_bootstrap_': 'x',
    'printbrlength_': '1',
}
raxml_input_files = {
    'infile_': '../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9_trimmed.aln',
}
job = cip.submitJob(
    vParams=raxml_tool_params,
    inputParams=raxml_input_files,
    metadata={'statusEmail': 'true'},
)

In [30]:
# Re-acquire a job handle from the server by taking the last entry of the account's job list.
# NOTE(review): this assumes the last listed job is the one submitted above — confirm the ordering.
listed_jobs = cip.listJobs()
job = listed_jobs[-1]

In [31]:
if cip.getJobStatus(jobHandle=job.jobHandle).isDone():
    !mkdir -p ../output/02_90p_autotrophic_rubisco_tree/RaxML/
    job.downloadResults('../output/02_90p_autotrophic_rubisco_tree/RaxML/')
else:
    print('Job ' + job.jobHandle + ' not finished')

In [2]:
# Write a type legend file (`outfile`) from the Jaffe et al. 2018 type table and the 90% cluster CSV.
# NOTE(review): `add_type` is presumably provided by `from helper import *`; the legend format
# (likely an ITOL annotation file) is decided inside the helper — confirm there.
add_type(type_file='../data/jaffe_et_al_2018_rubisco_types.csv',
         seq_file='../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.csv',
         outfile='../output/02_90p_autotrophic_rubisco_tree/type_legend.txt')

In [2]:
# Write a legend file (`outfile`) from the kinetically characterized and synthesized Rubisco
# FASTA files and the 90% cluster CSV.
# NOTE(review): `add_kinetic` is presumably provided by `from helper import *`; how sequences are
# matched to clusters and the legend format are decided inside the helper — confirm there.
add_kinetic(kinetic_file='../data/flamholz_et_al_2019_kinetically_characterized.faa',
            synth_file='../data/milo_synthetized_rubisco.faa',
            seq_file='../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.csv',
            outfile='../output/02_90p_autotrophic_rubisco_tree/kinetic_legend.txt')