In [1]:
import urllib.request
import os
import gzip
import shutil
import sys

In [2]:
def readGenome(filename):
    """
    Read every record of a FASTA format file into a dictionary.

    Blank lines (for example a trailing empty line at the end of the file) are ignored.

    :param filename: path of the FASTA file to read.
    :return: a dictionary whose keys are the header lines (including the leading '>')
             and whose values are the concatenated sequence lines of each record.
             If the same header appears more than once, the last record wins.
    :raises ValueError: if sequence data appears before the first header line.
    """
    # dna_sequences_by_name_dict is a dictionary with keys: header line, values: sequence.
    dna_sequences_by_name_dict = dict()
    # Header of the record currently being read; None until the first '>' line is seen.
    sequence_name = None
    # Sequence lines of the current record, joined once per record instead of
    # growing a string line by line.
    sequence_chunks = []

    with open(filename, 'r') as f:
        for line in f:
            # Remove the trailing newline and any surrounding whitespace.
            line = line.strip()
            # An empty line carries no data; indexing it would raise IndexError.
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the previous record, if there is one.
                if sequence_name is not None:
                    dna_sequences_by_name_dict[sequence_name] = ''.join(sequence_chunks)
                sequence_name = line
                sequence_chunks = []
            else:
                if sequence_name is None:
                    raise ValueError(
                        "sequence data found before the first '>' header in " + repr(filename)
                    )
                sequence_chunks.append(line)

    # Store the last record, which has no following header to close it.
    if sequence_name is not None:
        dna_sequences_by_name_dict[sequence_name] = ''.join(sequence_chunks)

    # Returns the dictionary with all the sequences read from the file.
    return dna_sequences_by_name_dict

## 1. Downloading the files:

In [3]:
# Single source of truth: genome name -> download URL of its gzipped FASTA file.
# Insertion order is the order in which the genomes are processed.
url_by_genome_name = {
    'chlamydophila_pneumoniae':
        'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/008/745/GCF_000008745.1_ASM874v1/GCF_000008745.1_ASM874v1_genomic.fna.gz',
    'thermosynechococcus_elongatus':
        'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/011/345/GCF_000011345.1_ASM1134v1/GCF_000011345.1_ASM1134v1_genomic.fna.gz',
    'bacillus_subtilis':
        'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/009/045/GCF_000009045.1_ASM904v1/GCF_000009045.1_ASM904v1_genomic.fna.gz',
    'legionella_pneumophila':
        'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/008/485/GCF_000008485.1_ASM848v1/GCF_000008485.1_ASM848v1_genomic.fna.gz',
    'haemophilus_influenzae':
        'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/027/305/GCF_000027305.1_ASM2730v1/GCF_000027305.1_ASM2730v1_genomic.fna.gz',
}

# List with the genome names, in the same order as the mapping above.
genomes = list(url_by_genome_name.keys())

# List with the download urls, index-aligned with the list `genomes`.
urls = list(url_by_genome_name.values())

In [4]:
# Names of the compressed archives, one per genome, in the order of `url_by_genome_name`.
# Each archive is deleted once it has been unpacked, so by the end of this cell these
# names no longer exist on disk; the unpacked files are the same names without '.gz'.
filenames = []

for genome, url in url_by_genome_name.items():
    # out_filename is the local name of the downloaded archive.
    out_filename = genome + '.fna.gz'
    filenames.append(out_filename)

    # Name of the unpacked FASTA file: the archive name without the '.gz' suffix.
    fasta_filename = out_filename[:-3]

    # Skip genomes that are already unpacked, so re-running the notebook does not
    # download everything again. Delete the .fna file to force a fresh download.
    if os.path.exists(fasta_filename):
        continue

    # Download the archive from the url and store it as out_filename.
    urllib.request.urlretrieve(url, out_filename)

    # Unpack into a temporary name and rename only when the copy is complete, so an
    # interrupted run never leaves a truncated .fna that would later be mistaken
    # for a finished file by the existence check above.
    partial_filename = fasta_filename + '.part'
    with gzip.open(out_filename, 'rb') as f_in, open(partial_filename, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(partial_filename, fasta_filename)

    # The compressed archive is no longer needed once it has been unpacked.
    os.remove(out_filename)
    
