In [3]:
import pandas as pd
from Bio import SeqIO, AlignIO, Seq
import numpy as np
from collections import Counter
from helper import *
import python_cipres.client as CipresClient

### Extract fasta sequences for the autotrophic rubiscos
Before running this code we need to generate a file called `autotrophic_rubisco_70p.csv` by selecting in ITOL clades that belong to Rubisco types 1,2,2/3,3a,3c,3-like,3b and IV. We replaced sapces in the sequence ID to `_` to match the sequence ID in the fasta files. We take only sequences which are not type IV or type III-b as autotrophic.

In [2]:
uclust_data = pd.read_csv('../output/01_70p_tree/uclust_all_0.7.csv')
uclust_data['cut Target'] = uclust_data.Target.apply(lambda x: x.split(' ')[0])

rubisco_types = pd.read_csv('../output/01_70p_tree/rubisco_types_70p.csv')

rubisco_with_type = uclust_data.merge(rubisco_types, left_on='cut Target', right_on='ID')
#rubisco_with_type[rubisco_with_type.type != 'IV'].groupby('type')['Query'].count()
type1 = rubisco_with_type[rubisco_with_type.type=='I']

In [6]:
uclust_data = pd.read_csv('../output/01_70p_tree/uclust_all_0.7.csv')
uclust_data['cut Target'] = uclust_data.Target.apply(lambda x: x.split(' ')[0])
rubisco_types = pd.read_csv('../output/01_70p_tree/rubisco_types_70p.csv')
carbo_rucisco = rubisco_types[~rubisco_types['type'].isin(['IV'])]
auto_rubisco = uclust_data.merge(carbo_rucisco, left_on='cut Target', right_on='ID')
true_rubisco  = auto_rubisco['Query'].values

auto_seq = []
for record in SeqIO.parse('../output/00_100p_tree/uclust_all_1.faa', "fasta"):
    if record.description in true_rubisco:
        auto_seq.append(record)
!mkdir -p ../output/02_90p_autotrophic_rubisco_tree
with open(r"../output/02_90p_autotrophic_rubisco_tree/true_rubisco_seq.faa", "w") as output_handle:
    SeqIO.write(auto_seq, output_handle, "fasta")

### Cluster sequences using uclust

In [10]:
!../bin/usearch11.0.667_i86linux32 -cluster_fast ../output/02_90p_autotrophic_rubisco_tree/true_rubisco_seq.faa -id 0.9 -uc ../output/02_90p_autotrophic_rubisco_tree/true_uclust_all_0.9.uc

usearch v11.0.667_i86linux32, 4.0Gb RAM (16.3Gb total), 8 cores
(C) Copyright 2013-18 Robert C. Edgar, all rights reserved.
https://drive5.com/usearch

License: yinonmoise.baron@weizmann.ac.il

00:00 62Mb    100.0% Reading ../output/02_90p_autotrophic_rubisco_tree/true_rubisco_seq.faa
00:00 99Mb    100.0% DF
00:00 100Mb  33565 seqs, 33565 uniques, 33565 singletons (100.0%)
00:00 100Mb  Min size 1, median 1, max 1, avg 1.00
00:00 105Mb   100.0% DB
00:01 167Mb   100.0% 1169 clusters, max size 6563, avg 28.7
                                                           
      Seqs  33565 (33.6k)
  Clusters  1169
  Max size  6563
  Avg size  28.7
  Min size  1
Singletons  674, 2.0% of seqs, 57.7% of clusters
   Max mem  167Mb
      Time  1.00s
Throughput  33.6k seqs/sec.



### Take only cluster fasta files and create csv file

In [11]:
parse_uclust(infile='../output/02_90p_autotrophic_rubisco_tree/true_uclust_all_0.9.uc',
             fasta='../output/02_90p_autotrophic_rubisco_tree/true_rubisco_seq.faa',
             outfasta='../output/02_90p_autotrophic_rubisco_tree/true_uclust_all_0.9.faa',
             outfile='../output/02_90p_autotrophic_rubisco_tree/true_uclust_all_0.9.csv'
            )

### Remove Sequences from Jaffe et al. - they come from a later query of the refseq DB

In [12]:
true_rubisco = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/true_uclust_all_0.9.csv')
jaffe = pd.read_csv('../data/jaffe_et_al_2018_rubisco_types_processed.csv')
true_rubisco = true_rubisco[~true_rubisco.Query.isin(jaffe.ID)]
true_rubisco['ID'] = true_rubisco.Target.apply(lambda x: x.split(' ')[0])

#true_rubisco_with_type = true_rubisco.merge(rubisco_types, left_on='ID', right_on='ID')
true_rubisco_with_type = uclust_data.merge(true_rubisco,left_on='Query',right_on='Query',suffixes=('_70','_90'))
true_rubisco_with_type = true_rubisco_with_type.merge(rubisco_types,left_on='cut Target', right_on='ID')
true_rubisco_with_type.groupby('type')['Query'].count()
true_rubisco_with_type.groupby('type')['Target_90'].nunique()

type
I         460
II        120
II/III     23
III       232
Name: Target_90, dtype: int64

In [33]:
p90 = true_rubisco_with_type[true_rubisco_with_type.type.isin(['II','II/III'])]
syn = pd.read_csv('../output/00_100p_tree/milo_syn_100p.csv')
r = p90.merge(syn,left_on='Query',right_on='0')
#p90.Target_90.nunique()
#syn['0'].nunique()
#
#p90[~p90.Target_90.isin(r.Target_90.unique())].Target_90.unique()

array(['RBG_16_Gammaproteobacteria_62_13_RBG_16_scaffold_11150_2 ribulose-bisphosphate carboxylase (EC:4.1.1.39); K01601 ribulose-bisphosphate carboxylase large chain [EC:4.1.1.39] Tax=RBG_16_Gammaproteobacteria_62_13_curated id=87108755 bin="Candidatus Muproteobacteria bacterium RBG_16_62_13" species=RBG_16_Gammaproteobacteria_62_13_curated genus=unknown taxon_order=unknown taxon_class=Gammaproteobacteria phylum=Proteobacteria organism_tax=RBG_16_Gammaproteobacteria_62_13_curated',
       'cg2_3.0_scaffold_9508_c_3 cbbM; ribulose bisphosphate carboxylase (EC:4.1.1.39); K01601 ribulose-bisphosphate carboxylase large chain [EC:4.1.1.39] Tax=CG_Rhodof_03 id=92052342 bin="CG2_30_FULL_Comamonadaceae_60_41_curated" species=CG_Rhodof_03 genus=Rhodoferax taxon_order=Burkholderiales taxon_class=Betaproteobacteria phylum=Proteobacteria organism_tax=CG_Rhodof_03, Rhodoferax, Burkholderiales, Betaproteobacteria, Proteobacteria, Bacteria',
       'gi|1232622198|gb|OYY58774.1| ribulose 1,5-bisphosp

In [34]:
#auto_rubisco_with_type = true_rubisco_with_type[true_rubisco_with_type.type.isin(['I','II','II/III'])]
auto_rubisco_with_type = true_rubisco_with_type[true_rubisco_with_type.type.isin(['I'])]
auto_rubisco_with_type.to_csv('../output/02_90p_autotrophic_rubisco_tree/type1_rubisco_0.9_with_type.csv')
auto_rubisco = true_rubisco[true_rubisco.Query.isin(auto_rubisco_with_type.Query)]
auto_rubisco.to_csv('../output/02_90p_autotrophic_rubisco_tree/type1_uclust_all_0.9.csv')
auto_rubisco_ID  = auto_rubisco['Target'].unique()

auto_seq = []
for record in SeqIO.parse('../output/02_90p_autotrophic_rubisco_tree/true_uclust_all_0.9.faa', "fasta"):
    if record.description in auto_rubisco_ID:
        auto_seq.append(record)
!mkdir -p ../output/02_90p_autotrophic_rubisco_tree
with open(r"../output/02_90p_autotrophic_rubisco_tree/type1_uclust_all_0.9.faa", "w") as output_handle:
    SeqIO.write(auto_seq, output_handle, "fasta")

We load the data from the old clustering results. We replace three centroids from the original file as they did not have sequences with canonical amino acids. 
* We replace gi|1143283753|pdb|5MAC|A with gi|499819577|ref|WP_011500311.1|
* We replace gi|1004829134|gb|KYC54090.1| with gi|1004818558|gb|KYC44365.1| 
* We remove gi|223601|prf||0903153A
We save the results in the file type_II.faa.sorted.0.90_processed.uc in the data folder

In [68]:
cluster_data = pd.read_csv('../data/type_II.faa.sorted.0.90_processed.uc',header=None)
cluster_data.loc[cluster_data[9] == '*',9] = cluster_data.loc[cluster_data[9] == '*',8]
cluster_data[9].nunique()
cluster_data[9] = cluster_data[9].str.replace('RBC','RBC_')
old_cluster_id = cluster_data[9].unique()


auto_seq = []
for record in SeqIO.parse('../output/00_100p_tree/merged_data_clean.faa', "fasta"):
    if record.id in [x.id for x in auto_seq]:
            continue
    if record.id in old_cluster_id:
        auto_seq.append(record)

#auto_seq = auto_seq+seqs

with open(r"../output/02_90p_autotrophic_rubisco_tree/old_cluster_II_II_III.0.9.faa", "w") as output_handle:
    SeqIO.write(auto_seq, output_handle, "fasta")

In [72]:
!cat ../output/02_90p_autotrophic_rubisco_tree/type1_uclust_all_0.9.faa ../output/02_90p_autotrophic_rubisco_tree/old_cluster_II_II_III.0.9.faa > ../output/02_90p_autotrophic_rubisco_tree/old_auto_uclust_all_0.9.faa

### Remove outliers - sequences which less than 50% of them map to Rr sequence by blast

In [73]:
from Bio.Blast.Applications import NcbiblastpCommandline
output = NcbiblastpCommandline(query="../output/02_90p_autotrophic_rubisco_tree/old_auto_uclust_all_0.9.faa", subject="../data/Rr.faa", outfmt=5)()[0]
from Bio.Blast import NCBIXML
from io import StringIO

res = []
for x in NCBIXML.parse(StringIO(output)):
    if len(x.alignments) == 0:
        res.append(0)
    else:
        alignment = x.alignments[0]
        res.append(pd.DataFrame([[x.align_length,x.expect] for x in alignment.hsps]).sort_values(by=1).loc[0,0]/x.query_length)

seq = [x for x in SeqIO.parse('../output/02_90p_autotrophic_rubisco_tree/old_auto_uclust_all_0.9.faa',format='fasta')]

res_df = pd.Series(res,index=[x.id for x in seq])

In [74]:
len(seq)

603

In [75]:
seqs = [x for x in seq if x.id in res_df[res_df >= 0.5].index]

with open(r"../output/02_90p_autotrophic_rubisco_tree/old_auto_uclust_all_0.9_no_outliers.faa", "w") as output_handle:
    SeqIO.write(seqs, output_handle, "fasta")
    
seqs2 = [x for x in seq if x.id in res_df[res_df < 0.5].index]
with open(r"../output/02_90p_autotrophic_rubisco_tree/old_auto_uclust_all_0.9_outliers.faa", "w") as output_handle:
    SeqIO.write(seqs2, output_handle, "fasta")

In [181]:
# These sequences were found in the cluster data but not in the fasta file - this is because the sequences have ambiguous AA
pd.DataFrame(old_cluster_id)[~pd.DataFrame(old_cluster_id).isin([x.id for x in auto_seq])[0]]

Unnamed: 0,0
79,gi|1143283753|pdb|5MAC|A
99,gi|1004829134|gb|KYC54090.1|
137,gi|223601|prf||0903153A


## Create multiple sequence alignment

In [81]:
#!../bin/mafft-linux64/mafft.bat ../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.faa > ../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9.aln
!../bin/mafft-linux64/mafft.bat ../output/02_90p_autotrophic_rubisco_tree/old_auto_uclust_all_0.9_no_outliers.faa > ../output/02_90p_autotrophic_rubisco_tree/old_auto_uclust_all_0.9_no_outliers.aln

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
  601 / 601
done.

Constructing a UPGMA tree (efffree=0) ... 
  590 / 601
done.

Progressive alignment 1/2... 
STEP   474 / 600 
Reallocating..done. *alloclen = 2256
STEP   501 / 600 
done.

Making a distance matrix from msa.. 
  500 / 601
done.

Constructing a UPGMA tree (efffree=1) ... 
  590 / 601
done.

Progressive alignment 2/2... 
STEP   501 / 600  h
Reallocating..done. *alloclen = 2291

done.

disttbfast (aa) Version 7.427
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more g

## Clean MSA to contain only positions with more than 5% coverage (based on Jaffe et al. 2018)

In [82]:
clean_aln(infile='../output/02_90p_autotrophic_rubisco_tree/old_auto_uclust_all_0.9_no_outliers.aln',
          outfile='../output/02_90p_autotrophic_rubisco_tree/old_auto_uclust_all_0.9_no_outliers_trimmed.aln')

In [89]:
cip = CipresClient.Client(appname='RO',
                    appID='rubisco_phylogeny-49F87B124F3D429FBE12F95E4254DDEA',
                    baseUrl='https://cipresrest.sdsc.edu/cipresrest/v1',
                    username='yinonbaron',
                    password='Mchcav11~')


In [90]:
job = cip.submitJob(vParams={'toolId': 'RAXMLHPC8_REST_XSEDE',
                       'datatype_': 'protein',
                       'runtime_': '160',
                       'select_analysis_': 'fa',
                       'choose_bootstrap_': 'x',
                       'printbrlength_': '1'},
              inputParams={'infile_': '../output/02_90p_autotrophic_rubisco_tree/old_auto_uclust_all_0.9_no_outliers_trimmed.aln'},
              metadata={'statusEmail':'true'})

In [10]:
job = cip.listJobs()[-1]

In [91]:
if cip.getJobStatus(jobHandle=job.jobHandle).isDone():
    !mkdir -p ../output/02_90p_autotrophic_rubisco_tree/RaxML/
    job.downloadResults('../output/02_90p_autotrophic_rubisco_tree/RaxML/')
else:
    print('Job ' + job.jobHandle + ' not finished')

Find the matching leaf labels to the synthesized sequences

In [130]:
t1 = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/type1_uclust_all_0.9.csv')
#t1.Target.nunique()
cluster_data = pd.read_csv('../data/type_II.faa.sorted.0.90_processed.uc',header=None)
cluster_data.loc[cluster_data[9] == '*',9] = cluster_data.loc[cluster_data[9] == '*',8]

cluster_data[9] = cluster_data[9].str.replace('RBC','RBC_')
cluster_data[8] = cluster_data[8].str.replace('RBC','RBC_')
#cluster_data.columns = t1.columns
cluster_data.columns = t1.columns[1:-1]
united_data = pd.concat([cluster_data,t1.iloc[:,1:-1]])
united_data.to_csv('../output/02_90p_autotrophic_rubisco_tree/old_auto_uclust_all_0.9.csv',index=False)

In [125]:
#united_data
syn = pd.read_csv('../data/milo_synth.csv')
found_ids = pd.concat([united_data[united_data.Target.isin(syn['Int ID'])].Target,united_data[united_data.Target.isin(syn['ID'])].Target])
found_ids.drop_duplicates().to_csv('../data/old_syn_labels.csv',index=False)

  after removing the cwd from sys.path.


In [146]:
synth = pd.read_csv('../data/milo_synth.csv')
synth.isin(cluster_data[9])

mis  =synth[~synth['Int ID'].isin(cluster_data[8])]
mis[~mis.ID.isin(cluster_data[8])]
cluster_data[8] = cluster_data[8].str.replace('RBC','RBC_')

In [192]:
g = cluster_data.merge(synth,left_on=8,right_on='Int ID',how='left')
g2 = g[g['Int ID'].isna()].merge(synth,left_on=8,right_on='ID',how='left')
#g2[g2['ID_y'].isna()]
g.loc[~g['Int ID'].isna(),9].nunique()
t = g2.loc[~g2['ID_y'].isna(),9]
cluster_data[~cluster_data[9].isin(t)]

0                                             RBC_7
1             gi|363498367|gb|AAQ04822.2|AF463409_1
2                  gi|502802415|ref|WP_013037391.1|
3                  gi|759380765|ref|WP_043107373.1|
4                      gi|668346614|emb|CDW95835.1|
5                      gi|1232606110|gb|OYY45669.1|
6                       gi|954037512|gb|ALP32073.1|
8        TARA_138.SAMEA2623390.450.0.22-3_1579609_5
10       TARA_102.SAMEA2622197.480.0.22-3_1360909_4
12       TARA_102.SAMEA2622197.480.0.22-3_1360909_4
14                      gi|589604584|gb|EXI76444.1|
15                        cg1_0.2_scaffold_4987_c_3
17         TARA_137.SAMEA2623295.40.0.22-3_128859_3
18                gi|1186171677|ref|WP_085372695.1|
19                                           RBC_54
23                     gi|144900524|emb|CAM77388.1|
24        TARA_137.SAMEA2623295.40.0.22-3_1214112_2
26                 gi|760066961|ref|WP_043749806.1|
28                                           RBC_49
29          

In [194]:
lines = t.apply(lambda x: x.split(' ')[0]).values + ',-1,1\n'
unique_lines = np.unique(lines)

with open('../data/kinetic_sampling_legend.txt','r') as file:
    with open('../output/02_90p_autotrophic_rubisco_tree/synth_legend_merge', "w") as f1:
        for row in file:
            f1.write(row)
        for line in unique_lines:
            f1.write(line)
        file.close()
        f1.close()

In [144]:
#[x for x in synth['Int ID']]
cluster_data[cluster_data[9].str.contains('RBC_81')]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
420,S,124,457,*,.,*,*,*,RBC________81,RBC_81
421,H,124,457,100.0,.,0,457,=,gi|655299332|ref|WP_028708174.1|,RBC_81
664,C,124,2,*,*,*,*,*,RBC________81,RBC_81


In [6]:
d = pd.read_csv('../output/00_100p_tree/uclust_all_1_rubisco_types.csv')
d2 = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9_no_outliers.csv')
d3 = d2.merge(d, left_on='Query', right_on='ID',how='left')
d3.type.unique()

array([nan, 'I', 'IIIlike', 'II', 'II/III', 'IIIc', 'unknown', 'IIIa',
       'IIIb'], dtype=object)

In [47]:
add_type(type_file='../output/00_100p_tree/uclust_all_1_rubisco_types.csv',
         seq_file='../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9_no_outliers.csv',
         outfile='../output/02_90p_autotrophic_rubisco_tree/type_legend.txt')

In [1]:
p90_clusters = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/true_uclust_all_0.9.csv')
p70_clusters = pd.read_csv('../output/01_70p_tree/uclust_all_0.7.csv')

type_data = pd.read_csv('../output/01_70p_tree/rubisco_types_70p.csv')

uclust_m = p90_clusters.merge(p70_clusters,left_on='Query',right_on='Query',suffixes=('_90','_70'))
uclust_m['ID'] = uclust_m['Target_70'].apply(lambda x: x.split(' ')[0])
uclust_mt= uclust_m.merge(type_data,left_on='ID',right_on='ID')
kin_data = pd.read_csv('../output/00_100p_tree/uclust_all_1_kinetic_data.csv')
uclust_mtk = uclust_mt.merge(kin_data,left_on='Query',right_on='kinetic_ID')

kin_data.loc[kin_data.kinetic_ID.isin(uclust_mtk[uclust_mtk.type.isin(['II','II/III'])].Query),'kinetic_ID'] = uclust_mtk[uclust_mtk.type.isin(['II','II/III'])].Target_90.apply(lambda x: x.split(' ')[1]).values
#kin_data.loc[kin_data.kinetic_ID.isin(uclust_mtk[uclust_mtk.type.isin(['II','II/III'])].Query),'kinetic_ID']
kin_data.to_csv('../output/02_90p_autotrophic_rubisco_tree/old_auto_kinetic_data.csv',index=False,header=None)


In [4]:
add_kinetic(kinetic_file='../output/02_90p_autotrophic_rubisco_tree/old_auto_kinetic_data.csv',
         synth_file='../data/old_syn_labels.csv',
         seq_file='../output/02_90p_autotrophic_rubisco_tree/old_auto_uclust_all_0.9.csv',
         outfile='../output/02_90p_autotrophic_rubisco_tree/kinetic_legend_3.txt')

In [170]:
synth_data = pd.read_csv('../data/old_syn_labels.csv',names=['syn_ID'])
kinetic_data = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/old_auto_kinetic_data.csv',names=['kinetic_ID'])
uclust_data = pd.read_csv('../output/02_90p_autotrophic_rubisco_tree/old_auto_uclust_all_0.9.csv')

uclust_data = uclust_data.merge(synth_data, left_on='Query', right_on='syn_ID',how='left')
uclust_data = uclust_data.merge(kinetic_data, left_on='Query', right_on='kinetic_ID',how='left')

In [172]:
uclust_data[uclust_data.Query.str.contains('494538')]
#kinetic_data
#kin_data

Unnamed: 0,Type,Cluster,Size,%Id,Strand,Qlo,Tlo,Alignment,Query,Target,syn_ID,kinetic_ID
43,S,24,466,*,.,*,*,*,gi|494538|pdb|1RBA|A,gi|494538|pdb|1RBA|A,gi|494538|pdb|1RBA|A,gi|494538|pdb|1RBA|A
564,C,24,7,*,*,*,*,*,gi|494538|pdb|1RBA|A,gi|494538|pdb|1RBA|A,gi|494538|pdb|1RBA|A,gi|494538|pdb|1RBA|A


In [None]:

uclust_data['kinetic_flag'] = '-1'
uclust_data['syn_flag'] = '-1'

kinetic_centroid = uclust_data.loc[~pd.isna(uclust_data['kinetic_ID']),'Target'].unique()
syn_centroid = uclust_data.loc[~pd.isna(uclust_data['syn_ID']),'Target'].unique()

uclust_data.loc[uclust_data['Target'].isin(kinetic_centroid),'kinetic_flag'] = '1'
uclust_data.loc[uclust_data['Target'].isin(syn_centroid),'syn_flag'] = '1'

lines = uclust_data['Target'].apply(lambda x: x.split(' ')[0]).values + ','+ uclust_data['kinetic_flag'].values+','+uclust_data['syn_flag'].values+'\n'
unique_lines = np.unique(lines)

with open('../data/kinetic_sampling_legend.txt','r') as file:
    with open(outfile, "w") as f1:
        for row in file:
            f1.write(row)
        for line in unique_lines:
            f1.write(line)
        file.close()
        f1.close()

In [2]:
add_kinetic_on_label(kinetic_file='../output/00_100p_tree/uclust_all_1_kinetic_data.csv',
         synth_file='../output/00_100p_tree/synth_data.csv',
         seq_file='../output/02_90p_autotrophic_rubisco_tree/auto_uclust_all_0.9_no_outliers.csv',
         outfile='../output/02_90p_autotrophic_rubisco_tree/kinetic_legend_label.txt')


['gi|604722319|dbj|BAO57366.1|,label,node,#B2BABB,1,normal\n'
 'gi|604722319|dbj|BAO57366.1|,label,node,#B2BABB,1,normal\n'
 'gi|604722319|dbj|BAO57366.1|,label,node,#B2BABB,1,normal\n' ...
 'gi|576253|pdb|1RBL|A,label,node,#B2BABB,1,normal\n'
 'gi|157678845|dbj|BAF80663.1|,label,node,#B2BABB,1,normal\n'
 'RBC2_50,label,node,#B2BABB,1,normal\n']


In [23]:
k = pd.read_csv('../output/00_100p_tree/uclust_all_1_kinetic_data.csv')
k

Unnamed: 0,kinetic_ID
0,RBC2_50 gi|499819577|ref|WP_011500311.1|
1,gi|499561661|ref|WP_011242444.1| MULTISPECIES:...
2,gi|499709017|ref|WP_011389751.1| ribulose-bisp...
3,gi|502736740|ref|WP_012971724.1| ribulose bisp...
4,"gi|356472757|gb|AET10441.1| ribulose-1,5-bisph..."
5,"gi|11467200|ref|NP_043033.1| ribulose-1,5-bisp..."
6,"gi|14017580|ref|NP_114267.1| ribulose-1,5-bisp..."
7,"gi|11497536|ref|NP_054944.1| ribulose-1,5-bisp..."
8,"gi|1248646408|gb|ATG33856.1| ribulose-1,5-bisp..."
9,gi|157678845|dbj|BAF80663.1| ribulose 1.5-bisp...


In [10]:
uclust = rubisco_types = pd.read_csv('../output/01_70p_tree/rubisco_types_70p.csv')
uclust70 = pd.read_csv('../output/01_70p_tree/uclust_all_0.7.csv')
rubisco_types = pd.read_csv('../output/01_70p_tree/rubisco_types_70p.csv')
#uclust[uclust.Query.str.contains('499819577')]
#uclust70[uclust70.Query.str.contains('499819577')]
uclust70

Unnamed: 0,Type,Cluster,Size,%Id,Strand,Qlo,Tlo,Alignment,Query,Target
0,H,1,486,74.2,.,0,0,2I472M14D,RBCSeed_18 gi|563352309|gb|AHB41464.1|,RBCSeed_14 gi|502802415|ref|WP_013037391.1|
1,H,2,464,72.6,.,0,0,I82M3I379M3D,RBCSeed_24 gi|759380765|ref|WP_043107373.1|,RBCSeed_25 gi|771607269|ref|WP_045218222.1|
2,H,2,449,74.1,.,0,0,3I72M3I377M10I,RBC_7 gi|544702516|ref|WP_021133537.1|,RBCSeed_25 gi|771607269|ref|WP_045218222.1|
3,H,2,459,71.0,.,0,0,3I73M3I386M,RBC2_42 gi|1057024209|ref|WP_068435278.1|,RBCSeed_25 gi|771607269|ref|WP_045218222.1|
4,H,2,459,74.9,.,0,0,3I79M3I380M,RBC2_44 gi|1062686077|ref|WP_069331322.1|,RBCSeed_25 gi|771607269|ref|WP_045218222.1|
5,H,4,461,87.6,.,0,0,2D459M,RBC_94 gi|983348972|ref|WP_060528669.1|,RBC3_1 WP_012823967.1
6,H,2,461,71.7,.,0,0,3I38MD34M3I387MD,RBC_33 gi|499759116|ref|WP_011439850.1|,RBCSeed_25 gi|771607269|ref|WP_045218222.1|
7,H,2,459,74.9,.,0,0,3I79M3I380M,RBC_3 gi|499658436|ref|WP_011339170.1|,RBCSeed_25 gi|771607269|ref|WP_045218222.1|
8,H,6,444,80.9,.,0,0,444M,RBC_32 gi|851283698|ref|WP_048147752.1|,Tk gi|499570457|ref|WP_011251240.1|
9,H,4,459,80.6,.,0,0,459M,RBC_73 gi|499689111|ref|WP_011369845.1|,RBC3_1 WP_012823967.1
