In [1]:
import pandas as pd
from Bio import SeqIO, AlignIO, Seq
import numpy as np
from collections import Counter
from helper import *
import python_cipres.client as CipresClient

### Cluster sequences using uclust

In [2]:
# Make sure the output directory for the 70%-identity tree exists (-p: no error if it already does).
! mkdir -p ../output/01_70p_tree
# Cluster the sequences of uclust_all_1.faa with usearch `-cluster_fast` at an identity
# threshold of 0.7 (`-id 0.7`); the cluster table is written to the `-uc` file.
!../bin/usearch11.0.667_i86linux32 -cluster_fast ../output/00_100p_tree/uclust_all_1.faa -id 0.7 -uc ../output/01_70p_tree/uclust_all_0.7.uc

usearch v11.0.667_i86linux32, 4.0Gb RAM (16.3Gb total), 8 cores
(C) Copyright 2013-18 Robert C. Edgar, all rights reserved.
https://drive5.com/usearch

License: yinonmoise.baron@weizmann.ac.il

00:00 67Mb    100.0% Reading ../output/00_100p_tree/uclust_all_1.faa
00:00 105Mb   100.0% DF
00:00 105Mb  40945 seqs, 40945 uniques, 40945 singletons (100.0%)
00:00 105Mb  Min size 1, median 1, max 1, avg 1.00
00:00 111Mb   100.0% DB
00:03 176Mb   100.0% 884 clusters, max size 17546, avg 46.3
                                                           
      Seqs  40945 (40.9k)
  Clusters  884
  Max size  17546 (17.5k)
  Avg size  46.3
  Min size  1
Singletons  452, 1.1% of seqs, 51.1% of clusters
   Max mem  176Mb
      Time  3.00s
Throughput  13.6k seqs/sec.



### Keep only the cluster FASTA files and create a CSV file

In [3]:
# Turn the usearch cluster table (.uc) plus the 100%-identity FASTA into the
# 70%-identity FASTA and CSV outputs. `parse_uclust` comes from `helper`;
# its exact output format is not visible in this notebook.
parse_uclust(
    infile='../output/01_70p_tree/uclust_all_0.7.uc',
    fasta='../output/00_100p_tree/uclust_all_1.faa',
    outfasta='../output/01_70p_tree/uclust_all_0.7.faa',
    outfile='../output/01_70p_tree/uclust_all_0.7.csv',
)

## Create multiple sequence alignment

In [4]:
# Align the 70%-identity cluster FASTA with MAFFT (no extra options passed, so MAFFT's
# defaults apply); the alignment is read from stdout and redirected into the .aln file.
!../bin/mafft-linux64/mafft.bat ../output/01_70p_tree/uclust_all_0.7.faa > ../output/01_70p_tree/uclust_all_0.7.aln

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
  801 / 884
done.

Constructing a UPGMA tree (efffree=0) ... 
  880 / 884
done.

Progressive alignment 1/2... 
STEP    19 / 883 
Reallocating..done. *alloclen = 2637
STEP   801 / 883 
Reallocating..done. *alloclen = 4617

done.

Making a distance matrix from msa.. 
  800 / 884
done.

Constructing a UPGMA tree (efffree=1) ... 
  880 / 884
done.

Progressive alignment 2/2... 
STEP   168 / 883 
Reallocating..done. *alloclen = 2492
STEP   801 / 883 
Reallocating..done. *alloclen = 3633

Reallocating..done. *alloclen = 4732

done.

disttbfast (aa) Version 7.427
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft -

## Clean MSA to contain only positions with more than 5% coverage (based on Jaffe et al. 2018)

In [5]:
# Filter the alignment with `clean_aln` (from `helper`) and write the trimmed
# alignment next to the input. The coverage threshold lives inside `clean_aln`
# and is not visible in this notebook.
clean_aln(
    infile='../output/01_70p_tree/uclust_all_0.7.aln',
    outfile='../output/01_70p_tree/uclust_all_0.7_trimmed.aln',
)

### Run RAxML on CIPRES

In [6]:
# Build the CIPRES REST client.
#
# Credentials must never be stored in the notebook: they are read from the
# environment variables CIPRES_APP_ID, CIPRES_USERNAME and CIPRES_PASSWORD,
# falling back to an interactive prompt when a variable is unset or empty.
# NOTE: credentials that were previously committed in this notebook should be
# treated as compromised and rotated on the CIPRES side.
import os
from getpass import getpass


def _cipres_credential(env_var, prompt, secret=True):
    """Return the value of ``env_var``, prompting the user when it is unset or empty.

    Parameters
    ----------
    env_var : str
        Name of the environment variable to read.
    prompt : str
        Text shown when falling back to interactive input.
    secret : bool
        When True the fallback prompt does not echo the typed value.
    """
    value = os.environ.get(env_var)
    if value:
        return value
    return getpass(prompt) if secret else input(prompt)


cip = CipresClient.Client(appname='RO',
                    appID=_cipres_credential('CIPRES_APP_ID', 'CIPRES application ID: '),
                    baseUrl='https://cipresrest.sdsc.edu/cipresrest/v1',
                    username=_cipres_credential('CIPRES_USERNAME', 'CIPRES username: ', secret=False),
                    password=_cipres_credential('CIPRES_PASSWORD', 'CIPRES password: '))


In [7]:

# Submit the trimmed alignment to the RAXMLHPC8_REST_XSEDE tool on CIPRES.
# The tool parameters are passed through verbatim; their meaning is defined by
# the CIPRES tool description, not by this notebook.
raxml_tool_params = {
    'toolId': 'RAXMLHPC8_REST_XSEDE',
    'datatype_': 'protein',
    'runtime_': '160',
    'select_analysis_': 'fa',
    'choose_bootstrap_': 'x',
    'printbrlength_': '1',
}
raxml_input_files = {'infile_': '../output/01_70p_tree/uclust_all_0.7_trimmed.aln'}

job = cip.submitJob(
    vParams=raxml_tool_params,
    inputParams=raxml_input_files,
    metadata={'statusEmail': 'true'},
)

In [9]:
if cip.getJobStatus(jobHandle=job.jobHandle).isDone():
    !mkdir -p ../output/01_70p_tree/RaxML/
    job.downloadResults('../output/01_70p_tree//RaxML/')
else:
    print('Job ' + job.jobHandle + ' not finished')

In [3]:
# Disabled alternative: submit the same tool/parameters through the CIPRES REST API with curl.
# Credentials are placeholders - provide CIPRES_USERNAME, CIPRES_PASSWORD and CIPRES_APP_ID yourself; never commit real values.
#!curl -u $CIPRES_USERNAME:$CIPRES_PASSWORD -H cipres-appkey:$CIPRES_APP_ID https://cipresrest.sdsc.edu/cipresrest/v1/job/$CIPRES_USERNAME -F tool=RAXMLHPC8_REST_XSEDE  -F input.infile_=@../output/01_70p_tree/uclust_all_0.7_trimmed.aln -F metadata.statusEmail=true -F vparam.datatype_=protein -F vparam.runtime_=160 -F  vparam.select_analysis_=fa -F vparam.choose_bootstrap_=x -F vparam.printbrlength_=1

In [3]:
# Optional - create tree using fasttree
# NOTE(review): the paths below (../output/03_..., ../output/04_...) do not follow the
# ../output/01_70p_tree/ layout used by the other cells - confirm them before re-enabling.
#!../bin/FastTree -spr 4 -mlacc 2 -slownni ../output/03_uclust_all_0.7_trimmed.aln > ../output/04_uclust_all_0.7.nwk

In [2]:
# Write the type legend file for the 70%-identity clusters from the rubisco
# type table and the cluster CSV, using `add_type` from `helper`.
add_type(
    type_file='../output/00_100p_tree/uclust_all_1_rubisco_types.csv',
    seq_file='../output/01_70p_tree/uclust_all_0.7.csv',
    outfile='../output/01_70p_tree/type_legend.txt',
)

In [1]:
# Count, per cluster ID, how many member sequences have an entry in the kinetic
# data table, and export the counts as a bar dataset appended to a template file.
uc = pd.read_csv('../output/01_70p_tree/uclust_all_0.7.csv')
kin = pd.read_csv('../output/00_100p_tree/uclust_all_1_kinetic_data.csv')

# Inner merge: only sequences whose Query matches a kinetic_ID are kept.
m = uc.merge(kin, left_on='Query', right_on='kinetic_ID')
# The cluster ID is the first whitespace-separated token of the Target header.
m['ID'] = m.Target.apply(lambda x: x.split(' ')[0])
m2 = m.groupby('ID')['kinetic_ID'].count()

# One "<ID>,<value>\n" line per cluster, value = log10(count) + 0.15.
# (Presumably the offset keeps clusters with a single entry, log10(1) == 0,
# visible as a bar - confirm.)
# `np.log10` replaces `pd.np.log10`: the `pandas.np` alias is deprecated and
# no longer exists in current pandas releases.
lines = [cluster_id + ',' + str(np.log10(n_kinetic) + 0.15) + '\n'
         for cluster_id, n_kinetic in m2.items()]
unique_lines = np.unique(lines)  # sorted, de-duplicated

# Copy the template header, then append the data lines. The `with` statement
# closes both files, so no explicit close() calls are needed.
with open('../data/dataset_simplebar_template.txt', 'r') as template, \
        open('../output/01_70p_tree/kinetic_bar.txt', 'w') as bar_file:
    bar_file.writelines(template)
    bar_file.writelines(unique_lines)

m2

ID
RBC3_1                                 1
RBCSeed_14                             1
RBCSeed_25                             1
Tk                                     1
gi|1271067907|gb|PHT72509.1|           2
gi|1463752065|ref|WP_116707191.1|      1
gi|15281545|gb|AAK94305.1|             2
gi|29726049|gb|AAL46534.1|             2
gi|358252603|gb|AEU04903.1|            8
gi|388537986|gb|AFK63174.1|            1
gi|494102711|ref|WP_007043498.1|       1
gi|503328002|ref|WP_013562663.1|       1
gi|518331021|ref|WP_019501228.1|     161
gi|523387592|emb|CDF56957.1|           2
gi|703265142|emb|CDI73524.1|          12
gi|851308879|ref|WP_048172188.1|       1
Name: kinetic_ID, dtype: int64

In [29]:
# Rebuild the per-leaf type/colour lines for the type legend from the rubisco
# type table and the cluster CSV. The final write step is currently disabled
# (see the string literal at the end of the cell).
type_file='../output/00_100p_tree/uclust_all_1_rubisco_types.csv'
seq_file='../output/01_70p_tree/uclust_all_0.7.csv'
outfile='../output/01_70p_tree/type_legend.txt'

labels = pd.read_csv(type_file)
uclust_data = pd.read_csv(seq_file)
# Drop rows of Type 'S'. The explicit copy makes the .loc assignment below
# operate on an independent frame and avoids pandas' SettingWithCopyWarning.
uclust_data = uclust_data[uclust_data['Type'] != 'S'].copy()
# Rows whose Target is the placeholder '*' get their own Query as Target.
no_target = uclust_data['Target'] == '*'
uclust_data.loc[no_target, 'Target'] = uclust_data.loc[no_target, 'Query']
# Left merge keeps every cluster row; rows without a label get NaN in 'type'.
uclust_data = uclust_data.merge(labels, left_on='Query', right_on='ID', how='left')
labeled_leaves = uclust_data[~pd.isna(uclust_data['type'])]
color_map = {'I': '#28B463',
             'II': '#E74C3C',
             'II/III':'#AF7AC5',
             'IIIa':'#AED6F1',
             'IIIb':'#3498DB',
             'IIIc':'#1F618D',
             'IIIlike':'#5D6D7E',
             'IV':'#F4D03F',
             'IVlike':'#F8C471',
             'unknown':'#F442D4'}
# One "<leaf id>,label,node,<colour>,1,normal" line per labelled row; the leaf id
# is the first whitespace-separated token of Target. A type missing from
# color_map raises KeyError. The set removes duplicate lines.
lines = {target.split(' ')[0] + ',label,node,' + color_map[leaf_type] + ',1,normal\n'
         for target, leaf_type in zip(labeled_leaves['Target'].values,
                                      labeled_leaves['type'].values)}
# The write step below is disabled by wrapping it in a string literal; as the last
# expression of the cell, the string itself is what the cell displays as output.
'''
with open('../data/itol_legend_template.txt','r') as file:
    with open(outfile, "w") as f1:
        for row in file:
            f1.write(row)
        for line in lines:
            f1.write(line)
        file.close()
        f1.close()
        '''

'\nwith open(\'../data/itol_legend_template.txt\',\'r\') as file:\n    with open(outfile, "w") as f1:\n        for row in file:\n            f1.write(row)\n        for line in lines:\n            f1.write(line)\n        file.close()\n        f1.close()\n        '