In [1]:
import glob
import os
import re

import pandas as pd
from Bio import SeqIO

In [2]:
# Collect every per-organism FASTA file. glob.glob() returns matches in an
# arbitrary, filesystem-dependent order, so the list is sorted to keep the
# processing order (and the row order of any file generated from it) reproducible.
files = sorted(glob.glob('Data/Organism_FASTA_Files/*.fasta'))
print(len(files))
files

20


['Data/Organism_FASTA_Files\\Acidiphilium_cryptum.fasta',
 'Data/Organism_FASTA_Files\\Bacillus_cereus.fasta',
 'Data/Organism_FASTA_Files\\Bacillus_subtilis.fasta',
 'Data/Organism_FASTA_Files\\Bacteroides_fragilis.fasta',
 'Data/Organism_FASTA_Files\\Bacteroides_thetaiotaomicron.fasta',
 'Data/Organism_FASTA_Files\\Cellulophaga_baltica.fasta',
 'Data/Organism_FASTA_Files\\Cellvibrio_gilvus.fasta',
 'Data/Organism_FASTA_Files\\Chryseobacterium_indologenes.fasta',
 'Data/Organism_FASTA_Files\\Citrobacter_freundii.fasta',
 'Data/Organism_FASTA_Files\\Coprococcus_comes.fasta',
 'Data/Organism_FASTA_Files\\Cupriavidus_necator.fasta',
 'Data/Organism_FASTA_Files\\Delftia_acidovorans.fasta',
 'Data/Organism_FASTA_Files\\Mycobacterium_smegmatis.fasta',
 'Data/Organism_FASTA_Files\\Paracoccus_denitrificans.fasta',
 'Data/Organism_FASTA_Files\\Rhizobium_radiobacter.fasta',
 'Data/Organism_FASTA_Files\\Rhodopseudomonas_palustris.fasta',
 'Data/Organism_FASTA_Files\\Ruminococcus_gnavus.fasta',
 

In [3]:
# Flatten every per-organism FASTA file into one CSV with the columns
# Organism,Gene,Sequence. The context manager guarantees the CSV is flushed
# and closed even if parsing one of the FASTA files raises part-way through.
with open('Data/organism_genes.csv', 'w') as outfile:
    outfile.write('Organism,Gene,Sequence\n')
    for file in files:
        # Match against the file name only, so a directory component that
        # happens to look like "Word_word" is never taken for the organism.
        organism = re.search('[A-Za-z]+_[a-z]+', os.path.basename(file))
        if organism is None:
            raise ValueError('Cannot derive an organism name from ' + repr(file))
        organism = organism[0]
        print(organism)
        for seq_record in SeqIO.parse(file, 'fasta'):
            # The gene name is the run of non-whitespace characters that
            # follows "GN=" in the record description; records without a
            # GN= tag are skipped.
            gene_name = re.search(r'(?<=GN=)\S*', seq_record.description)
            if gene_name:
                # str() turns the sequence object into plain text before it
                # is joined with the other fields.
                outfile.write(organism + ',' + gene_name.group() + ',' + str(seq_record.seq) + '\n')

Acidiphilium_cryptum
Bacillus_cereus
Bacillus_subtilis
Bacteroides_fragilis
Bacteroides_thetaiotaomicron
Cellulophaga_baltica
Cellvibrio_gilvus
Chryseobacterium_indologenes
Citrobacter_freundii
Coprococcus_comes
Cupriavidus_necator
Delftia_acidovorans
Mycobacterium_smegmatis
Paracoccus_denitrificans
Rhizobium_radiobacter
Rhodopseudomonas_palustris
Ruminococcus_gnavus
Stigmatella_aurantiaca
Streptomyces_griseorubens
Sulfobacillus_thermosulfidooxidans


In [4]:
# Load the Organism/Gene/Sequence table from disk and display it.
organism_genes_csv = 'Data/organism_genes.csv'
organism_genes = pd.read_csv(organism_genes_csv)
organism_genes

Unnamed: 0,Organism,Gene,Sequence
0,Acidiphilium_cryptum,nuoN2,MTAPAFTAKAFAAFAPFTLLGAVTILVMLLIAVRRDHRLVALSTIA...
1,Acidiphilium_cryptum,rlmE,MTEETIGSRRRAAVRLKAARKHKPSSQKWLLRQLNDPYVAAAKERG...
2,Acidiphilium_cryptum,Acry_0770,MIPRLTDAERAALVDLLPEWSLAKDRDAIERRFAFADFSEAFAFMT...
3,Acidiphilium_cryptum,pth,MKLWVGLGNPEPGMARNRHNIGFMAIDVIADRHGFAPWRKRFSGLV...
4,Acidiphilium_cryptum,rplN,MIIVESNLDVADNSGARRVQCIKVLGGSKRRTASVGDVIVVSIKDA...
...,...,...,...
97726,Sulfobacillus_thermosulfidooxidans,C7B47_13575,MIRRSLLARWMLITSSVVALSLMTWALFSLFNVKTPEPARVMPVAH...
97727,Sulfobacillus_thermosulfidooxidans,mtaD,MRYRIETDAILTMDDEFRVFRPGQLTWEDGTIVSVGPVADDATPVD...
97728,Sulfobacillus_thermosulfidooxidans,crcB,MNDIIIFVGGFLGAVARFQVGQWVGQRTSGGFPYGTLVINTLGCLF...
97729,Sulfobacillus_thermosulfidooxidans,C7B47_12330,MIFIFPFRGGTTRIYYDDSQSGSGSMSTHLTVKGTTVTVSPWFNNQ...


In [5]:
# Number of distinct organisms present in the gene table.
distinct_organisms = organism_genes['Organism'].unique()
len(distinct_organisms)

20

In [6]:
# Load the table of homologous (conserved) genes and display it.
homologous_genes_csv = 'Data/Homologous_genes.csv'
conserved_genes = pd.read_csv(homologous_genes_csv)
conserved_genes

Unnamed: 0.1,Unnamed: 0,Gene,Organism
0,0,secB,Acidiphilium_cryptum
1,1,prfA,Acidiphilium_cryptum
2,5,rplB,Acidiphilium_cryptum
3,6,prfB,Acidiphilium_cryptum
4,12,rpmD,Acidiphilium_cryptum
...,...,...,...
2832,102148,rplR,Sulfobacillus_thermosulfidooxidans
2833,102470,hisD,Sulfobacillus_thermosulfidooxidans
2834,102526,fabD,Sulfobacillus_thermosulfidooxidans
2835,102570,folD,Sulfobacillus_thermosulfidooxidans


In [7]:
# Write one multi-FASTA file per conserved gene, containing the matching
# sequences from organism_genes.
# Forward slashes / os.path.join keep the output path portable (the other
# cells already use forward slashes); the directory is created on demand so
# the cell also works on a fresh checkout.
GENE_FASTA_DIR = 'Data/Gene_FASTA_Files'
os.makedirs(GENE_FASTA_DIR, exist_ok=True)

genes = pd.unique(conserved_genes.Gene)
print(len(genes))
# Maps each base header to the number of extra (duplicate) records seen so far.
headers = dict()
for gene in genes:
    # Literal substring match: regex=False stops characters in a gene name
    # from being interpreted as a regular expression, and na=False keeps rows
    # with a missing Gene value from breaking the boolean mask.
    # NOTE(review): this is a substring match, so a gene name also selects
    # longer names that contain it -- confirm an exact match is not intended.
    df = organism_genes[organism_genes.Gene.str.contains(gene, regex=False, na=False)]
    file_name = os.path.join(GENE_FASTA_DIR, gene + '.fasta')
    with open(file_name, 'w') as outfile:
        for row in df.itertuples():
            header = '> ' + row.Organism + '(' + gene + ')'
            if header in headers:
                # Repeated organism/gene pair: tag it with a running
                # "alt v N" counter so every header in the file is unique.
                headers[header] += 1
                suffix = '[alt v ' + str(headers[header]) + ']'
            else:
                headers[header] = 0
                suffix = ''
            outfile.write(header + suffix + '\n')
            outfile.write(row.Sequence + '\n')

317


In [8]:
# Count the conserved-gene rows whose gene name has no exact match in the
# Gene column of organism_genes.
has_exact_match = conserved_genes['Gene'].isin(organism_genes['Gene'])
len(conserved_genes.loc[~has_exact_match])

7