In [1]:
import urllib.request
import os
import gzip
import shutil
import glob

In [2]:
def readGenome(filename):
    """
    Read every sequence of a FASTA file into a dictionary.

    :param filename: path of the FASTA file to read (relative paths are resolved against the working directory).
    :return: a dictionary with the full header line (including the leading '>') as key
             and the concatenated sequence lines that follow it as value. A header with
             no sequence lines maps to an empty string; an empty file gives an empty dict.
    :raises ValueError: if sequence data appears before the first '>' header line.
    :raises OSError: if the file cannot be opened.
    """
    # Maps each header line to the list of sequence lines read for it so far.
    # Chunks are joined once at the end instead of growing a string line by line.
    sequence_chunks_by_name = dict()
    # Chunk list of the record currently being read; None until the first header is seen.
    current_chunks = None
    with open(filename, 'r') as f:
        for line in f:
            # Drop the trailing newline and any surrounding whitespace.
            line = line.strip()
            # Blank lines (common at the end of a file) carry no data; indexing them would fail.
            if not line:
                continue
            if line.startswith('>'):
                # A header starts a new record. A repeated header replaces the earlier record.
                current_chunks = []
                sequence_chunks_by_name[line] = current_chunks
            elif current_chunks is None:
                raise ValueError(
                    "%s is not a valid FASTA file: sequence data found before the first '>' header"
                    % filename
                )
            else:
                current_chunks.append(line)
    # Join the collected lines of every record into a single sequence string.
    return {name: ''.join(chunks) for name, chunks in sequence_chunks_by_name.items()}

In [3]:
# Download location of each genome on the NCBI FTP server, keyed by genome name.
# Every URL points to a gzip-compressed file whose name ends in "_genomic.fna.gz".
url_by_genome_name = {
    'chlamydophila_pneumoniae':
        'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/008/745/GCF_000008745.1_ASM874v1/GCF_000008745.1_ASM874v1_genomic.fna.gz',
    'thermosynechococcus_elongatus':
        'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/011/345/GCF_000011345.1_ASM1134v1/GCF_000011345.1_ASM1134v1_genomic.fna.gz',
    'bacillus_subtilis':
        'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/009/045/GCF_000009045.1_ASM904v1/GCF_000009045.1_ASM904v1_genomic.fna.gz',
    'legionella_pneumophila':
        'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/008/485/GCF_000008485.1_ASM848v1/GCF_000008485.1_ASM848v1_genomic.fna.gz',
    'haemophilus_influenzae':
        'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/027/305/GCF_000027305.1_ASM2730v1/GCF_000027305.1_ASM2730v1_genomic.fna.gz',
}

# Parallel lists in the same order as the mapping above: genomes[i] is downloaded from urls[i].
genomes = list(url_by_genome_name.keys())
urls = list(url_by_genome_name.values())

In [10]:
# FASTA files expected in the working directory, one per genome.
# 'Downloading_Genome.ipynb' used to be listed here as well; it is a notebook,
# not a FASTA file, so it has been dropped from the list of inputs.
filenames = [
    'bacillus_subtilis.fna',
    'chlamydophila_pneumoniae.fna',
    'haemophilus_influenzae.fna',
    'legionella_pneumophila.fna',
    'thermosynechococcus_elongatus.fna'
]

In [4]:
# Glob patterns of the files to collect from the working directory.
# The trailing comma matters: ("*.fna") is just the string "*.fna", and looping
# over a string yields its characters ('*', '.', 'f', ...), so glob('*') would
# have collected every file in the directory.
extensions = ("*.fna",)
filenames_wd = []
for pattern in extensions:
    # extend() appends each matching file name; sorted() keeps the order reproducible,
    # because glob returns matches in arbitrary order.
    filenames_wd.extend(sorted(glob.glob(pattern)))

In [6]:
# Download every genome and decompress it to '<genome>.fna' in the working directory.
# `genomes` and `urls` are the parallel lists defined in an earlier cell.
# After this cell `filenames` lists the decompressed FASTA files, i.e. files that
# actually exist on disk (the downloaded .fna.gz archives are deleted below).
filenames = []

for genome, url in zip(genomes, urls):
    fasta_filename = genome + '.fna'
    filenames.append(fasta_filename)

    # Skip genomes that are already present so re-running the notebook
    # does not download everything again.
    if os.path.exists(fasta_filename):
        continue

    # Fetch the compressed genome from its URL.
    gz_filename = fasta_filename + '.gz'
    urllib.request.urlretrieve(url, gz_filename)

    # Decompress into a temporary '.part' file and rename it only once it is complete,
    # so an interrupted run never leaves a truncated .fna that a later run would skip.
    partial_filename = fasta_filename + '.part'
    with gzip.open(gz_filename, 'rb') as f_in, open(partial_filename, 'wb') as f_out:
        # Stream-copy the decompressed bytes from f_in to f_out.
        shutil.copyfileobj(f_in, f_out)
    os.replace(partial_filename, fasta_filename)

    # Remove the original .fna.gz archive.
    os.remove(gz_filename)
    


In [None]:
# FASTA files to summarise, one per genome.
# 'Downloading_Genome.ipynb' used to be listed here as well; it is a notebook,
# not a FASTA file, so reading it could only ever end in the error branch below.
filenames = [
    'bacillus_subtilis.fna',
    'chlamydophila_pneumoniae.fna',
    'haemophilus_influenzae.fna',
    'legionella_pneumophila.fna',
    'thermosynechococcus_elongatus.fna'
]

if __name__ == '__main__':
    for filename in filenames:
        try:
            # readGenome returns a dict with the header line as key and the sequence as value.
            sequences_by_name = readGenome(filename)
            print(filename)
            for sequence_name, dna_sequence in sequences_by_name.items():
                # For every sequence show its length and its first 15 characters.
                print(len(dna_sequence))
                print(dna_sequence[:15])
        except Exception as e:
            # Best effort: report the file that failed and keep going with the next one.
            print(filename)
            print(e)