In [1]:
## Align genomes with mauve
## Extract orthologs with ID 35% Coverage 51%

In [2]:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import numpy as np
import os
from os import listdir,walk
from os.path import isfile, join

In [18]:
######## read and parse ortholog file #########
# Genome labels; the ortholog table refers to them by 0-based position in this list.
genomes=['SUP05','Bathy','R.magnifica','R.fausta','R.pacifica','R.phaseoliformis','R.pliocardia','R.rectimargo','R.southwardae','V.diagonalis','V.extenta','V.soyoae1','V.soyoae2','V.gigas1','V.gigas2','V.okutanii','V.marissinica']
# NOTE(review): this list does not look position-aligned with `genomes` (later output shows
# SP60_ tags under SUP05 and Rmag_ tags under R.magnifica). It is only used by the
# commented-out `pfx` line below, so nothing here depends on it -- confirm before reusing.
locus_tag_prefixes=['MS2017','SP60','HUE58','Rmag','Rpac','Rpha','Rpli','Rrec','Rsou','Vdia','Vext','Vsoy1','Vsoy2','Vgig1','Vgig2','COSY','Vmar']
#pfx=dict(zip(locus_tag_prefixes,genomes))
# genome position -> genome label
idx_dict=dict(zip(range(len(genomes)),genomes))

ALL_raw=pd.read_csv('ALL_id35cov51.orthologs',sep='\t',header=None,names=genomes)
# Same shape as the raw table, initially all-NaN; filled with bare locus tags below.
ALL=pd.DataFrame([],columns=genomes, index=ALL_raw.index)

# Every non-empty raw cell is split on ':' into '<genome position>:<locus tag>'.
# The tag is stored under the genome named by that position, which is not necessarily
# the column the cell was read from.
for idx, raw_row in ALL_raw.iterrows():
    for entry in raw_row:
        if pd.isna(entry):
            # empty cell: this ortholog group has no member at this position
            continue
        fields=entry.split(':')
        # Single .loc[row, col] write: the chained form ALL.loc[idx][col]=... assigns to an
        # intermediate Series and is silently lost when pandas hands back a copy.
        ALL.loc[idx, idx_dict[int(fields[0])]]=fields[1]
            



In [24]:
## remove pseudogenes
# GenBank annotation file of each genome, keyed by the genome labels used as columns of ALL.
goodfiles={'Bathy':'../genomes/Bathy_complete_with_CRISPRannot.gbk',
'R.fausta':'../genomes/R.fausta.gbk',
'R.magnifica':'../genomes/R.magnifica.gbk',
'R.pacifica':'../genomes/R.pacifica_one_contig_circular_fully_annotated.gbk',
'R.phaseoliformis':'../genomes/R.phaseoliformis_8_contigs_fully_annotated.gbk',
'R.pliocardia':'../genomes/R.pliocardia_one_contig_circular_fully_annotated.gbk',
'R.rectimargo':'../genomes/R.rectimargo_one_contig_circular_fully_annotated.gbk',
'R.southwardae':'../genomes/R.southwardae_39_contigs_fully_annotated.gbk',
'SUP05':'../genomes/SUP05.gbk',
'V.diagonalis':'../genomes/V.diagonalis_one_contig_circular_fully_annotated.gbk',
'V.extenta':'../genomes/V.extenta_one_contig_circular_fully_annotated.gbk',
'V.gigas1':'../genomes/V.gigas1_one_contig_circular_fully_annotated.gbk',
'V.gigas2':'../genomes/V.gigas2_one_contig_circular_fully_annotated.gbk',
'V.marissinica':'../genomes/V.marissinica_withlocustags.gbk',
'V.okutanii':'../genomes/V.okutanii_with_locus_tags.gbk',
'V.soyoae1':'../genomes/V.soyoae1_one_contig_circular_fully_annotated.gbk',
'V.soyoae2':'../genomes/V.soyoae2_one_contig_circular_fully_annotated.gbk'}

# Locus tags of every CDS feature that carries a 'pseudo' qualifier, across all genomes.
pseudogenes=[]
for sample in goodfiles.keys():
    for record in SeqIO.parse(goodfiles[sample],'genbank'):
        for feature in record.features:
            # Only look up the locus tag once the feature is known to be a pseudo CDS,
            # so an untagged ordinary CDS cannot abort the scan with a KeyError.
            if feature.type=='CDS' and 'pseudo' in feature.qualifiers:
                pseudogenes.append(feature.qualifiers['locus_tag'][0])

print(len(pseudogenes))

# Blank out pseudogene entries so they are not counted as present below.
ALL[ALL.isin(pseudogenes)] = np.nan

# Number of genomes with a (non-pseudogene) member in each ortholog group.
ALL['Count']=ALL[genomes].count(axis=1)
ALL.to_csv('Orthology_id35cov51.txt',header=True,sep='\t')

# Core set: groups with a member in every genome. The threshold follows the genome list
# instead of a literal 17, so it stays correct if a genome is added or dropped.
CORE=ALL[ALL['Count']==len(genomes)][genomes]
CORE.to_csv('CoreOrthology_id35cov51.txt',header=True,sep='\t')

95


In [27]:
# Row counts, one per line: the core table first, then the full ortholog table.
print(len(CORE), len(ALL), sep='\n')

739
6084


In [28]:
### Prep input files for Venn diagram ###
#http://www.interactivenn.net/


# df=pd.read_csv('Orthology_id35cov51',sep='\t',header=0)
df=ALL.copy()

# The row index of ALL serves as the ortholog-group identifier.
df['gene_number']=df.index.values
# print( df.columns)
# Genome groups compared in the diagram: group label -> member genome columns.
dic={}
dic['FL']=['Bathy','SUP05']
dic['Ruthia']=['R.magnifica','R.fausta', 'R.pacifica','R.phaseoliformis', 'R.pliocardia', 'R.rectimargo', 'R.southwardae']
dic['Gigas']=['V.diagonalis', 'V.extenta', 'V.soyoae1','V.soyoae2', 'V.gigas1','V.gigas2','V.okutanii','V.marissinica']

# Two lines per group, each '<label>:<comma-separated group ids>;'.
# The file is opened once in a context manager (previously truncated, then reopened in
# append mode on every iteration), so the handle is closed even if a write fails.
with open('input_ivenn_id35cov51.txt','w') as fa:
    for k,v in dic.items():
        # pan set: groups present in at least one genome of this group
        pan=df[v].dropna(how='all',axis=0).index
        pan_entry=k+':'+','.join([str(g) for g in pan])+';\n'

        # core set: groups present in every genome of this group
        core=df[v].dropna(how='any',axis=0).index
        core_entry=k+'_core'+':'+','.join([str(g) for g in core])+';\n'

        fa.write(pan_entry)
        fa.write(core_entry)




In [29]:
## Parse ivenn data
# Each line is split on ': ' into a region label and a comma-separated member list;
# only the label and the number of members are kept.
with open('ivenn_id35cov51.txt') as f :
    lines=[[parts[0],len(parts[1].split(','))]
           for parts in (text.split(': ') for text in f.read().splitlines())]

# Sum the member counts of every region whose label mentions the group name
# (substring match, so 'FL' also matches labels containing 'FL_core').
tot_FL=sum(size for label,size in lines if 'FL' in label)
tot_Ruthia=sum(size for label,size in lines if 'Ruthia' in label)
tot_Gigas=sum(size for label,size in lines if 'Gigas' in label)
print('tot_FL = '+str(tot_FL),'tot_Ruthia = '+str(tot_Ruthia),'tot_Gigas = '+str(tot_Gigas))
lines

tot_FL = 2805 tot_Ruthia = 4219 tot_Gigas = 1244


[['[FL]', 1593],
 ['[Ruthia]', 2966],
 ['[Gigas]', 229],
 ['[Ruthia] and [Gigas]', 36],
 ['[Gigas] and [Gigas_core]', 6],
 ['[FL] and [FL_core]', 33],
 ['[FL] and [Ruthia]', 112],
 ['[Ruthia] and [Ruthia_core]', 6],
 ['[FL] and [Gigas] and [Gigas_core]', 3],
 ['[FL] and [FL_core] and [Gigas] and [Gigas_core]', 1],
 ['[FL] and [Ruthia] and [Gigas]', 4],
 ['[FL] and [FL_core] and [Ruthia] and [Gigas]', 16],
 ['[FL] and [Ruthia] and [Gigas] and [Gigas_core]', 21],
 ['[FL] and [FL_core] and [Ruthia] and [Gigas] and [Gigas_core]', 59],
 ['[FL] and [Ruthia] and [Ruthia_core] and [Gigas]', 4],
 ['[FL] and [FL_core] and [Ruthia] and [Ruthia_core] and [Gigas]', 28],
 ['[FL] and [FL_core] and [Ruthia] and [Ruthia_core] and [Gigas] and [Gigas_core]',
  747],
 ['[FL] and [Ruthia] and [Ruthia_core] and [Gigas] and [Gigas_core]', 54],
 ['[Ruthia] and [Ruthia_core] and [Gigas] and [Gigas_core]', 18],
 ['[Ruthia] and [Gigas] and [Gigas_core]', 18],
 ['[FL] and [Ruthia] and [Ruthia_core]', 11],
 ['[FL]

In [6]:
## extract core sequences and align them
########### CORE: retrieve sequences from GBK, align aa, backtranslate and output fna; 715 sequences ##########
from io import StringIO
from Bio import AlignIO
from Bio.codonalign import build
from Bio.Alphabet import Gapped,IUPAC
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Alphabet import generic_dna, generic_protein
from Bio.Align.Applications import MuscleCommandline,ClustalwCommandline
import subprocess
import sys

# os.mkdir('./fna')
# os.mkdir('./faa')


# GenBank annotation file of each genome, keyed by the genome labels used as CORE columns.
goodfiles={'Bathy':'../genomes/Bathy_complete_with_CRISPRannot.gbk',
'R.fausta':'../genomes/R.fausta.gbk',
'R.magnifica':'../genomes/R.magnifica.gbk',
'R.pacifica':'../genomes/R.pacifica_one_contig_circular_fully_annotated.gbk',
'R.phaseoliformis':'../genomes/R.phaseoliformis_8_contigs_fully_annotated.gbk',
'R.pliocardia':'../genomes/R.pliocardia_one_contig_circular_fully_annotated.gbk',
'R.rectimargo':'../genomes/R.rectimargo_one_contig_circular_fully_annotated.gbk',
'R.southwardae':'../genomes/R.southwardae_39_contigs_fully_annotated.gbk',
'SUP05':'../genomes/SUP05.gbk',
'V.diagonalis':'../genomes/V.diagonalis_one_contig_circular_fully_annotated.gbk',
'V.extenta':'../genomes/V.extenta_one_contig_circular_fully_annotated.gbk',
'V.gigas1':'../genomes/V.gigas1_one_contig_circular_fully_annotated.gbk',
'V.gigas2':'../genomes/V.gigas2_one_contig_circular_fully_annotated.gbk',
'V.marissinica':'../genomes/V.marissinica_withlocustags.gbk',
'V.okutanii':'../genomes/V.okutanii_with_locus_tags.gbk',
'V.soyoae1':'../genomes/V.soyoae1_one_contig_circular_fully_annotated.gbk',
'V.soyoae2':'../genomes/V.soyoae2_one_contig_circular_fully_annotated.gbk'}

CORE=pd.read_csv('CoreOrthology_id35cov51.txt',header=0,sep='\t',index_col=0)

# NOTE(review): every GenBank file is re-parsed for every gene; if the full range below is
# re-enabled, indexing the CDS features once per genome would avoid that repeated work.
# for i in range(688,len(CORE)):
for i in [81,280,525,526,533,687]:
    gene=CORE.iloc[i]
    # Output files are named after the R.magnifica member of the group.
    gene_name=gene['R.magnifica']
    # Skip groups whose R.magnifica tag has 'R' right after the underscore
    # (e.g. Rmag_R0022) -- presumably RNA genes; confirm.
    if gene_name.split('_')[1][0]=='R':
        continue
    fna=[]
    faa=[]
    print(i,gene_name)
    for sample in sorted(gene.index):
        for record in SeqIO.parse(goodfiles[sample],'genbank'):
            for feature in record.features:
                if feature.type!='CDS':
                    continue
                locus_tag = feature.qualifiers['locus_tag'][0]
                if locus_tag != gene[sample]:
                    continue

                ### create SeqRecord object ####
                DNAseq=feature.location.extract(record).seq
                if locus_tag == gene_name:
                    print(len(DNAseq))
                try:
                    AAseq=Seq(feature.qualifiers['translation'][0],alphabet=Gapped(IUPAC.extended_protein))
                except KeyError:
                    # No annotated translation. Previously AAseq kept the value from an
                    # earlier feature (or was undefined), so the wrong protein could be
                    # written under this header. Translate the CDS instead (table 11, as
                    # in the fallback that used to be commented out here).
                    # NOTE(review): to_stop=True truncates at the first stop codon --
                    # check any feature printed below.
                    print(feature)
                    AAseq=Seq(str(DNAseq.translate(table=11,to_stop=True)),alphabet=Gapped(IUPAC.extended_protein))
                header='|'.join([sample,gene[sample]])

                fna_seq = SeqRecord(DNAseq,id=header)
                faa_seq = SeqRecord(AAseq,id=header)

                fna.append(fna_seq)
                faa.append(faa_seq)

    faa=sorted(faa, key=lambda x : x.id)
    SeqIO.write(faa, 'faa/'+gene_name+'.faa', "fasta")

    ### run MUSCLE ####
    muscle_cline = MuscleCommandline(clwstrict=True)
    child = subprocess.Popen(str(muscle_cline),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform!="win32"))
    # Feed the input and drain stdout/stderr together with communicate(). Writing to stdin
    # and only afterwards reading stdout, with stderr never read, can deadlock once one of
    # the pipe buffers fills up.
    fasta_in=StringIO()
    SeqIO.write(faa, fasta_in, "fasta")
    muscle_stdout, muscle_stderr = child.communicate(fasta_in.getvalue())
    if child.returncode != 0:
        raise RuntimeError('MUSCLE failed for '+gene_name+':\n'+muscle_stderr)
    align = AlignIO.read(StringIO(muscle_stdout), "clustal",alphabet=Gapped(IUPAC.extended_protein))
    # Put the aligned proteins in id order before pairing them with the nucleotide records.
    aln = MultipleSeqAlignment(sorted(align, key=lambda x : x.id))
    codon_aln = build(aln, fna)

    SeqIO.write(aln, 'faa/aligned_'+gene_name+'.faa', "fasta")
    SeqIO.write(codon_aln, 'fna/aligned_'+gene_name+'.fna', "fasta")

81 Rmag_0392
3825


KeyboardInterrupt: 

In [None]:
# Genes flagged for a rerun, listed as <row position in CORE> <R.magnifica locus tag>.
# The bare list at the end holds the same row positions used by the extraction loops.
# 81 Rmag_0392 issue will have to rerun
# 280 Rmag_1021 issue will have to rerun
# 525 Rmag_0811
# 526 Rmag_0810
# 533 Rmag_0803
# 687 Rmag_0544
[81,280,525,526,533,687]

In [7]:
## extract core sequences and align them
########### CORE: retrieve sequences from GBK, align aa, backtranslate and output fna; 715 sequences ##########
from Bio import AlignIO
from Bio.codonalign import build
from Bio.Alphabet import Gapped,IUPAC
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Alphabet import generic_dna, generic_protein
from Bio.Align.Applications import MuscleCommandline,ClustalwCommandline
import subprocess
import sys

# os.mkdir('./fna')
# os.mkdir('./faa')


# GenBank annotation file of each genome, keyed by the genome labels used as CORE columns.
goodfiles={'Bathy':'../genomes/Bathy_complete_with_CRISPRannot.gbk',
'R.fausta':'../genomes/R.fausta.gbk',
'R.magnifica':'../genomes/R.magnifica.gbk',
'R.pacifica':'../genomes/R.pacifica_one_contig_circular_fully_annotated.gbk',
'R.phaseoliformis':'../genomes/R.phaseoliformis_8_contigs_fully_annotated.gbk',
'R.pliocardia':'../genomes/R.pliocardia_one_contig_circular_fully_annotated.gbk',
'R.rectimargo':'../genomes/R.rectimargo_one_contig_circular_fully_annotated.gbk',
'R.southwardae':'../genomes/R.southwardae_39_contigs_fully_annotated.gbk',
'SUP05':'../genomes/SUP05.gbk',
'V.diagonalis':'../genomes/V.diagonalis_one_contig_circular_fully_annotated.gbk',
'V.extenta':'../genomes/V.extenta_one_contig_circular_fully_annotated.gbk',
'V.gigas1':'../genomes/V.gigas1_one_contig_circular_fully_annotated.gbk',
'V.gigas2':'../genomes/V.gigas2_one_contig_circular_fully_annotated.gbk',
'V.marissinica':'../genomes/V.marissinica_withlocustags.gbk',
'V.okutanii':'../genomes/V.okutanii_with_locus_tags.gbk',
'V.soyoae1':'../genomes/V.soyoae1_one_contig_circular_fully_annotated.gbk',
'V.soyoae2':'../genomes/V.soyoae2_one_contig_circular_fully_annotated.gbk'}

CORE=pd.read_csv('CoreOrthology_id35cov51.txt',header=0,sep='\t',index_col=0)

# This cell only writes the unaligned protein FASTA of each listed gene; no alignment is run.
# for i in range(688,len(CORE)):
for i in [81,280,525,526,533,687]:
    gene=CORE.iloc[i]
    # Output files are named after the R.magnifica member of the group.
    gene_name=gene['R.magnifica']
    # Skip groups whose R.magnifica tag has 'R' right after the underscore
    # (e.g. Rmag_R0022) -- presumably RNA genes; confirm.
    if gene_name.split('_')[1][0]=='R':
        continue
    # fna is collected alongside faa but is not written out in this cell.
    fna=[]
    faa=[]
    print(i,gene_name)
    for sample in sorted(gene.index):
        for record in SeqIO.parse(goodfiles[sample],'genbank'):
            for feature in record.features:
                if feature.type!='CDS':
                    continue
                locus_tag = feature.qualifiers['locus_tag'][0]
                if locus_tag != gene[sample]:
                    continue

                ### create SeqRecord object ####
                DNAseq=feature.location.extract(record).seq
                if locus_tag == gene_name:
                    print(len(DNAseq))
                try:
                    AAseq=Seq(feature.qualifiers['translation'][0],alphabet=Gapped(IUPAC.extended_protein))
                except KeyError:
                    # No annotated translation. Previously AAseq kept the value from an
                    # earlier feature (or was undefined), so the wrong protein could be
                    # written under this header. Translate the CDS instead (table 11, as
                    # in the fallback that used to be commented out here).
                    # NOTE(review): to_stop=True truncates at the first stop codon --
                    # check any feature printed below.
                    print(feature)
                    AAseq=Seq(str(DNAseq.translate(table=11,to_stop=True)),alphabet=Gapped(IUPAC.extended_protein))
                header='|'.join([sample,gene[sample]])

                fna_seq = SeqRecord(DNAseq,id=header)
                faa_seq = SeqRecord(AAseq,id=header)

                fna.append(fna_seq)
                faa.append(faa_seq)

    faa=sorted(faa, key=lambda x : x.id)
    SeqIO.write(faa, 'faa/'+gene_name+'.faa', "fasta")
    


81 Rmag_0392
3825
280 Rmag_1021
3429
525 Rmag_0811
4083
526 Rmag_0810
4188
533 Rmag_0803
3435
687 Rmag_0544
3405


In [497]:
# Build and write the codon alignment for each gene flagged for a rerun.
# NOTE(review): this cell defines neither `child` nor `fna`; both must already exist in the
# kernel. Every iteration reads from the same `child.stdout` handle and pairs the result
# with the same `fna` list, so it cannot yield a distinct alignment per gene as written --
# MUSCLE would have to be re-run and the nucleotide records rebuilt for each gene_name.
# for i in range(688,len(CORE)):
for i in [81,280,525,526,533,687]:
    gene=CORE.iloc[i]
    # The output file is named after the R.magnifica member of the group.
    gene_name=CORE.iloc[i]['R.magnifica']
    
    align = AlignIO.read(child.stdout, "clustal",alphabet=Gapped(IUPAC.extended_protein))
    # Order the aligned proteins by record id before building the codon alignment.
    aln = MultipleSeqAlignment([f for f in sorted(align, key=lambda x : x.id)])
    codon_aln = build(aln, fna)

    SeqIO.write(codon_aln, 'fna/aligned_'+gene_name+'.fna', "fasta")

In [15]:
# Compare the core set of the new orthology table (ALL) with the previous one (ORTH),
# using the R.magnifica locus tag of each group as its identifier.
ORTH=pd.read_csv('Ortho_all_28052020.txt',sep='\t')
ALL=pd.read_csv('Orthology_id35cov51.txt',sep='\t')
# print(ALL[:5])

# R.magnifica tags of the core groups in each table (Count==17 in ALL, genome_count==15 in ORTH)
core_tags_all=ALL[ALL['Count']==17]['R.magnifica'].values
core_tags_orth=ORTH[ORTH['genome_count']==15]['R.magnifica'].values
every_tag_orth=ORTH['R.magnifica'].values

# core in ALL, but absent from the R.magnifica column of the previous table
ALL_only=[tag for tag in core_tags_all if tag not in every_tag_orth]
print(len(ALL_only))

# core in the previous table, but not core in ALL
lines_only=[tag for tag in core_tags_orth if tag not in core_tags_all]
print(len(lines_only))
lines_only
# ALL_only

# @id70: 193 core genes missing
# @id60: 39 core genes missing 
# @id45: 34 core genes missing
# @id35: 23 core genes missing
# 4 present in ALL_only because pseudogenes if R.magnifica; have to be kicked out

4
31


['Rmag_0004',
 'Rmag_0019',
 'Rmag_0022',
 'Rmag_0023',
 'Rmag_1057',
 'Rmag_0235',
 'Rmag_0238',
 'Rmag_0785',
 'Rmag_0701',
 'Rmag_0699',
 'Rmag_0446',
 'Rmag_0624',
 'Rmag_0623',
 'Rmag_0600',
 'Rmag_0570',
 'Rmag_0562',
 'Rmag_0537',
 'Rmag_0483',
 'Rmag_0420',
 'Rmag_0337',
 'Rmag_0304',
 'Rmag_0303',
 'Rmag_0909',
 'Rmag_0162',
 'Rmag_0158',
 'Rmag_0145',
 'Rmag_0074',
 'Rmag_0058',
 'Rmag_1039',
 'Rmag_R0022',
 'Rmag_1053']

In [16]:
# Genome columns of the previous orthology table (ORTH).
orth_genome_cols=['Bathy', 'SUP05', 'R.magnifica', 'R.pacifica', 'R.phaseoliformis',
       'R.pleocardia', 'R.rectimargo', 'R.southwardae', 'V.diagonalis',
       'V.extenta', 'V.okutanii', 'V.gigas2', 'V.soyoae2', 'V.gigas1',
       'V.soyoae1']

# ORTH rows for the groups listed in lines_only, ordered by R.magnifica tag, flattened
# into a single array of their genome-column values.
discr_rows=ORTH[ORTH['R.magnifica'].isin(lines_only)].sort_values(['R.magnifica'])
discr=discr_rows[orth_genome_cols].values.flatten()
# print(discr)
# ALL[ALL.isin(discr)]


def _row_hits_discr(row):
    """Return True when any value of this ALL row occurs in `discr`."""
    return row.isin(discr).any()


# Rows of ALL containing at least one of those values.
ALL[ALL.apply(_row_hits_discr, axis=1)]

## To do possibly
#remove rows from ALL
# append rows from ORTH

Unnamed: 0.1,Unnamed: 0,SUP05,Bathy,R.magnifica,R.fausta,R.pacifica,R.phaseoliformis,R.pliocardia,R.rectimargo,R.southwardae,V.diagonalis,V.extenta,V.soyoae1,V.soyoae2,V.gigas1,V.gigas2,V.okutanii,V.marissinica,Count
32,32,SP60_00260,MS2017_1241,,,,,,,,,,,,,,,,2
72,72,SP60_00510,MS2017_0872,Rmag_0446,,Rpac_peg_984,Rpha_peg_396,Rpli_peg_1307,Rrec_peg_1072,Rsou_peg_1404,Vdia_peg_563,Vext_peg_551,Vsoy1_peg_439,Vsoy2_peg_638,Vgig1_peg_520,Vgig2_peg_407,COSY_0412,Vmar_0415,16
94,94,SP60_00760,MS2017_1396,Rmag_0420,,Rpac_peg_1416,Rpha_peg_461,Rpli_peg_1647,Rrec_peg_1026,Rsou_peg_632,Vdia_peg_590,Vext_peg_578,Vsoy1_peg_415,Vsoy2_peg_615,Vgig1_peg_546,Vgig2_peg_431,COSY_0389,Vmar_0387,16
193,193,SP60_01530,MS2017_1654,Rmag_0303,,Rpac_peg_459,Rpha_peg_1729,Rpli_peg_201,Rrec_peg_134,Rsou_peg_1698,Vdia_peg_699,Vext_peg_688,Vsoy1_peg_311,Vsoy2_peg_515,Vgig1_peg_644,Vgig2_peg_531,COSY_0285,Vmar_0285,16
403,403,SP60_03800,MS2017_2127,Rmag_1053,HUE58_RS05470,Rpac_peg_860,Rpha_peg_916,Rpli_peg_1108,Rrec_peg_666,Rsou_peg_1601,Vdia_peg_1017,Vext_peg_997,Vsoy1_peg_992,,,,,,12
430,430,SP60_04035,MS2017_0005,,,,,,,,,,,,,,,,2
444,444,SP60_04115,MS2017_0034,Rmag_0019,,Rpac_peg_210,Rpha_peg_1048,Rpli_peg_1569,Rrec_peg_722,Rsou_peg_1777,Vdia_peg_966,Vext_peg_954,Vsoy1_peg_44,Vsoy2_peg_248,Vgig1_peg_28,Vgig2_peg_940,COSY_0018,Vmar_0020,16
447,447,SP60_04130,MS2017_0038,Rmag_0022,,Rpac_peg_207,Rpha_peg_1051,Rpli_peg_1566,Rrec_peg_725,Rsou_peg_468,Vdia_peg_963,Vext_peg_951,Vsoy1_peg_47,Vsoy2_peg_251,Vgig1_peg_31,Vgig2_peg_937,COSY_0021,Vmar_0023,16
552,552,SP60_05115,MS2017_1882,,,,,,,,,,,,,,,,2
566,566,SP60_05225,MS2017_1854,Rmag_0162,,Rpac_peg_3,Rpha_peg_1995,Rpli_peg_1340,Rrec_peg_923,Rsou_peg_1966,Vdia_peg_819,Vext_peg_808,Vsoy1_peg_187,Vsoy2_peg_392,Vgig1_peg_170,Vgig2_peg_60,COSY_0166,Vmar_0162,16


In [17]:
# Compare whole ortholog rows between ALL and ORTH over the 15 genomes present in both.
# Each row becomes one '-'-joined string of its locus tags, and the two collections are
# then compared as sets.
# The earlier selections restricted to Count<17 / genome_count<15 were overwritten on the
# very next statement and never used, so only the unfiltered comparison is kept.
shared_genomes_all=['Bathy', 'SUP05', 'R.magnifica', 'R.pacifica', 'R.phaseoliformis',
       'R.pliocardia', 'R.rectimargo', 'R.southwardae', 'V.diagonalis',
       'V.extenta', 'V.okutanii', 'V.gigas2', 'V.soyoae2', 'V.gigas1',
       'V.soyoae1']
# Same genomes in the same order; ORTH spells the column 'R.pleocardia'.
shared_genomes_orth=['R.pleocardia' if g=='R.pliocardia' else g for g in shared_genomes_all]

x=ALL[shared_genomes_all].values
all_str=['-'.join(map(str,a)) for a in x]

x=ORTH[shared_genomes_orth].values
# Normalise the 'Rple' spelling inside the joined row so it compares equal to 'Rpli' in ALL.
orth_str=['-'.join(map(str,a)).replace('Rple','Rpli') for a in x]

all_rows=set(all_str)
orth_rows=set(orth_str)
print(len(orth_rows-all_rows))   # rows only in ORTH
print(len(all_rows-orth_rows))   # rows only in ALL
print(len(all_rows&orth_rows))   # rows identical in both
print(len(orth_str),len(all_str))

ORTH[ORTH['Bathy']=='MS2017_0003']

985
818
5176
6163 6084


Unnamed: 0,Bathy,SUP05,R.magnifica,R.pacifica,R.phaseoliformis,R.pleocardia,R.rectimargo,R.southwardae,V.diagonalis,V.extenta,V.okutanii,V.gigas2,V.soyoae2,V.gigas1,V.soyoae1,product,genome_count
4548,MS2017_0003,,,,,,,,,,,,,,,recombination protein F,1


In [266]:
######## create dictionaries locus_tag -> db_xref, locus_tag -> product  #########
# print is called as a function throughout: the bare Python 2 `print x` statements used
# here before are a SyntaxError under the Python 3 used by the rest of this notebook.


assemblies=['R.pacifica.gbk', 'R.phaseoliformis.gbk', 'R.pleocardia.gbk', 'R.rectimargo.gbk', 'R.southwardae.gbk', 'R.kilmeri.gbk','V.extenta.gbk','V.gigas.gbk','V.diagonalis.gbk']

# NOTE(review): absolute, machine-specific path -- this cell only runs where it exists.
mypath='/Users/maeperez/Desktop/SYMBIONT_GENOMES/Clams_symb/'
# Full paths of the GenBank files in mypath that are named in `assemblies`.
onlyfiles = [mypath+f for f in listdir(mypath) if f[-3:]=='gbk' and f.split('/')[-1] in assemblies]
print('\n'.join(onlyfiles))

# Both dictionaries have the shape {sample: {locus_tag: value}}; the sample is the file
# name without its '.gbk' suffix.
db_xref_dic={}
product_dic={}
for file in onlyfiles:
    sample=file.split('/')[-1][:-4]
    db_xref_dic[sample]={}
    product_dic[sample]={}
    for record in SeqIO.parse(file,'genbank'):
        for feature in record.features:
            if feature.type=='CDS':
                locus_tag = feature.qualifiers['locus_tag'][0]
                # Spot check: print the nucleotide sequence of one specific CDS.
                if locus_tag =='Rsou_peg_980':
                    print(feature.location.extract(record).seq)
                # Only the first db_xref / product value of the feature is kept.
                db_xref = feature.qualifiers['db_xref'][0]
                product = feature.qualifiers['product'][0]

                db_xref_dic[sample][locus_tag]=db_xref
                product_dic[sample][locus_tag]=product


print(product_dic['R.pacifica']['Rpac_peg_1059'])

##### create new df with db_xref and products corresponding to the locus_tags  #########

# NOTE(review): `orth` is not defined anywhere in the visible part of the notebook; it must
# already exist in the kernel. The nested dicts are passed to DataFrame.replace, so the
# outer keys (sample names) are expected to match column names of `orth` -- confirm.
orth_dbxref=orth.replace(db_xref_dic)
orth_prod=orth.replace(product_dic)
print(orth[:4])
print(orth_prod[:4])

/Users/maeperez/Desktop/SYMBIONT_GENOMES/Clams_symb/R.pacifica.gbk
/Users/maeperez/Desktop/SYMBIONT_GENOMES/Clams_symb/R.phaseoliformis.gbk
/Users/maeperez/Desktop/SYMBIONT_GENOMES/Clams_symb/R.pleocardia.gbk
/Users/maeperez/Desktop/SYMBIONT_GENOMES/Clams_symb/R.rectimargo.gbk
/Users/maeperez/Desktop/SYMBIONT_GENOMES/Clams_symb/V.diagonalis.gbk
/Users/maeperez/Desktop/SYMBIONT_GENOMES/Clams_symb/V.gigas.gbk
/Users/maeperez/Desktop/SYMBIONT_GENOMES/Clams_symb/R.southwardae.gbk
/Users/maeperez/Desktop/SYMBIONT_GENOMES/Clams_symb/R.kilmeri.gbk
/Users/maeperez/Desktop/SYMBIONT_GENOMES/Clams_symb/V.extenta.gbk
ATGATGATGACTGACACACCACTGGTAATTGCAGGAAAAACTTATCACTCTCGCTTGCTGGTTGGCTCAGGAAAATACAAAGATTTAACGCAAACAAAACTAGCAACTGAAGCCGCTCAGGCTGATATTATTACCGTTGCCATTCGTCGAACCAACATCGGACAAGACAAAAACGAGCCAAATTTATTAGACGTTATCAGCCTCGATAAATACACTATTTTGCCTAATACTGCAGGTTGCTACACCGCTAAAGATGCCGTACGTACGTGCCAATTAGCACGTGAACTTTTAGGCGGACATAATTTGGTTAAATTAGAAGTATTAGGCGATGAAAAAATCCTATACCCCAATATTGTTGAAACCCTATCTGCTGCACAAACCCTAGTT