In [1]:
from Bio import SeqIO
import glob
import re
import pandas as pd

In [2]:
# Collect the per-organism FASTA files. glob returns matches in arbitrary,
# OS-dependent order, so sort them: the row order of everything derived from
# `files` is then the same on every machine and on every re-run.
files = sorted(glob.glob('Organism_FASTA_Files/*.fasta'))
print(len(files))
files

20


['Organism_FASTA_Files\\Acidiphilium_cryptum.fasta',
 'Organism_FASTA_Files\\Bacillus_cereus.fasta',
 'Organism_FASTA_Files\\Bacillus_subtilis_168.fasta',
 'Organism_FASTA_Files\\Bacteroides_fragilis.fasta',
 'Organism_FASTA_Files\\Bacteroides_thetaiotaomicron.fasta',
 'Organism_FASTA_Files\\Cellulophaga_baltica_18.fasta',
 'Organism_FASTA_Files\\Cellvibrio_gilvus.fasta',
 'Organism_FASTA_Files\\Chryseobacterium_indologenes2.fasta',
 'Organism_FASTA_Files\\Citrobacter_freundii.fasta',
 'Organism_FASTA_Files\\Coprococcus_comes.fasta',
 'Organism_FASTA_Files\\Cupriavidus_necator.fasta',
 'Organism_FASTA_Files\\Delftia_acidovorans.fasta',
 'Organism_FASTA_Files\\Mycobacterium_smegmatis.fasta',
 'Organism_FASTA_Files\\Paracoccus_dentrificans.fasta',
 'Organism_FASTA_Files\\Rhizobium_radiobacte.fasta',
 'Organism_FASTA_Files\\Rhodopseudomonas_palustris.fasta',
 'Organism_FASTA_Files\\Ruminococcus_gnavus.fasta',
 'Organism_FASTA_Files\\Stigmatella_aurantiaca.fasta',
 'Organism_FASTA_Files\\S

In [3]:
# Flatten every FASTA file in `files` into one CSV (Organism, Gene, Sequence),
# one row per record whose description carries a 'GN=<name>' tag; records
# without that tag are skipped.
# The `with` block guarantees the CSV is flushed and closed even if one of the
# FASTA files fails to parse part-way through.
with open('organism_genes.csv', 'w') as outfile:
    outfile.write('Organism,Gene,Sequence\n')
    for file in files:
        # Organism = file name without its directory and '.fasta' extension.
        # Both '\' and '/' are accepted as separators so the cell is not tied
        # to Windows-style paths.
        organism = re.search(r'[^\\/]+(?=\.fasta$)', file).group()
        print(organism)
        # Gene names already written for this organism (set => O(1) lookup).
        genes = set()
        # Gene name -> number of *additional* copies seen so far.
        multiple_copies = {}
        for seq_record in SeqIO.parse(file, 'fasta'):
            gene_match = re.search(r'(?<=GN=)\S*', seq_record.description)
            if not gene_match:
                continue
            gene_name = gene_match.group()
            if gene_name in genes:
                # Repeated name within one organism: keep the row but make the
                # label unique as '<name> alt version N' (N = 1, 2, ...).
                copy_number = multiple_copies.get(gene_name, 0) + 1
                multiple_copies[gene_name] = copy_number
                gene_name = gene_name + ' alt version ' + str(copy_number)
            else:
                genes.add(gene_name)
            outfile.write(organism + ',' + gene_name + ',' + str(seq_record.seq) + '\n')
            
        

Acidiphilium_cryptum
Bacillus_cereus
Bacillus_subtilis_168
Bacteroides_fragilis
Bacteroides_thetaiotaomicron
Cellulophaga_baltica_18
Cellvibrio_gilvus
Chryseobacterium_indologenes2
Citrobacter_freundii
Coprococcus_comes
Cupriavidus_necator
Delftia_acidovorans
Mycobacterium_smegmatis
Paracoccus_dentrificans
Rhizobium_radiobacte
Rhodopseudomonas_palustris
Ruminococcus_gnavus
Stigmatella_aurantiaca
Streptomyces griseorubens
Sulfobacillus _hermosulfidooxidans


In [4]:
# Load 'organism_genes.csv' (header: Organism,Gene,Sequence) into a DataFrame;
# the bare expression on the last line displays it as the cell output.
organism_genes = pd.read_csv('organism_genes.csv')
organism_genes

Unnamed: 0,Organism,Gene,Sequence
0,Acidiphilium_cryptum,nuoN2,MTAPAFTAKAFAAFAPFTLLGAVTILVMLLIAVRRDHRLVALSTIA...
1,Acidiphilium_cryptum,rlmE,MTEETIGSRRRAAVRLKAARKHKPSSQKWLLRQLNDPYVAAAKERG...
2,Acidiphilium_cryptum,Acry_0770,MIPRLTDAERAALVDLLPEWSLAKDRDAIERRFAFADFSEAFAFMT...
3,Acidiphilium_cryptum,pth,MKLWVGLGNPEPGMARNRHNIGFMAIDVIADRHGFAPWRKRFSGLV...
4,Acidiphilium_cryptum,rplN,MIIVESNLDVADNSGARRVQCIKVLGGSKRRTASVGDVIVVSIKDA...
...,...,...,...
97726,Sulfobacillus _hermosulfidooxidans,C7B47_13575,MIRRSLLARWMLITSSVVALSLMTWALFSLFNVKTPEPARVMPVAH...
97727,Sulfobacillus _hermosulfidooxidans,mtaD,MRYRIETDAILTMDDEFRVFRPGQLTWEDGTIVSVGPVADDATPVD...
97728,Sulfobacillus _hermosulfidooxidans,crcB alt version 1,MNDIIIFVGGFLGAVARFQVGQWVGQRTSGGFPYGTLVINTLGCLF...
97729,Sulfobacillus _hermosulfidooxidans,C7B47_12330,MIFIFPFRGGTTRIYYDDSQSGSGSMSTHLTVKGTTVTVSPWFNNQ...


In [5]:
# Sanity check: number of distinct organisms in the combined table.
# dropna=False keeps a missing value (if any) counted as its own entry.
organism_genes['Organism'].nunique(dropna=False)

20

In [6]:
# Load the table of conserved (homologous) genes and report how many distinct
# organisms it covers; the final expression displays the frame itself.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks
# like a saved index - confirm whether it should be read with index_col=0.
conserved_genes = pd.read_csv('Homologous_genes.csv')
print(conserved_genes['Organism'].nunique(dropna=False))
conserved_genes

20


Unnamed: 0.1,Unnamed: 0,Gene,Organism
0,0,rpoD,acidovorans
1,2,rsfS,acidovorans
2,3,rpmB,acidovorans
3,6,infB,acidovorans
4,9,rpsF,acidovorans
...,...,...,...
3320,166987,map,tumefaciens
3321,167323,aroC,tumefaciens
3322,167950,rsmH,tumefaciens
3323,168394,lepA,tumefaciens


In [7]:
# Write one FASTA file per conserved gene, containing that gene's sequence
# from every organism in `organism_genes` (header: '> <Organism> <Gene>').
genes = pd.unique(conserved_genes.Gene)
print(len(genes))

# Duplicate gene names were labelled '<name> alt version N' when the table was
# built. Strip that suffix once, up front, so each row can be compared to a
# gene by its exact base name. An exact comparison (rather than a substring /
# regex search) keeps e.g. 'map' from also selecting 'mapA', and missing gene
# names simply compare as False.
base_gene_names = organism_genes.Gene.str.replace(r' alt version \d+$', '', regex=True)

for gene in genes:
    gene_rows = organism_genes[base_gene_names == gene]
    # Forward slash works as a path separator on Windows and POSIX alike.
    file_name = 'Gene_FASTA_Files/' + gene + '.fasta'
    # `with` closes the file even if a row cannot be written.
    with open(file_name, 'w') as outfile:
        for row in gene_rows.itertuples():
            outfile.write('> ' + row.Organism + ' ' + row.Gene + '\n')
            outfile.write(row.Sequence)
            outfile.write('\n')

304


In [8]:
# Count the conserved-gene rows whose gene name has no exact match anywhere in
# the organism table (summing the boolean mask instead of building the subset).
int((~conserved_genes['Gene'].isin(organism_genes['Gene'])).sum())

9