In [1]:
import ftplib as ftp
import os
import re
import shutil
import subprocess

import jdc
import pandas as pd
from ete3 import NCBITaxa
ncbi = NCBITaxa()

In [12]:
class cd:
    """
    Context manager for changing the current working directory.

    On entry the process moves into ``newPath`` (a leading ``~`` is expanded);
    on exit it moves back to whatever directory was current when the block was
    entered. ``__exit__`` returns ``None``, so an exception raised inside the
    ``with`` body still propagates after the directory has been restored.
    """
    def __init__(self, newPath):
        # Expand "~" once, up front, so the stored path is what chdir receives.
        self.newPath = os.path.expanduser(newPath)

    def __enter__(self):
        # Remember where we came from before moving.
        self.savedPath = os.getcwd()
        os.chdir(self.newPath)
        # Hand back the manager so ``with cd(path) as ctx`` binds a usable
        # object (``ctx.savedPath`` / ``ctx.newPath``) instead of ``None``.
        return self

    def __exit__(self, etype, value, traceback):
        os.chdir(self.savedPath)

In [2]:
class taxa_sample(object):
    """
    Holds an assembly summary table and the state derived from it.

    Attributes are set in ``__init__``:

    * ``assembly_summary`` -- DataFrame indexed by ``assembly_accession``.
    * ``genome_taxids``    -- initialised to ``None``.

    Further methods are attached to this class in later notebook cells through
    the ``%%add_to taxa_sample`` cell magic.
    """

    def __init__(self,
                 assembly_summary='/work/ncbi_dbs/assembly_summary_refseq.txt'):
        """
        Read the table at ``assembly_summary`` into ``self.assembly_summary``.

        The file is read with ``pd.read_table`` using no header row, ``#`` as
        the comment character, and the 22 column names listed below.
        ``taxid`` and ``infraspecific_name`` are forced to ``str``.
        """
        column_names = [
            'assembly_accession', 'bioproject', 'biosample', 'wgs_master',
            'refseq_category', 'taxid', 'species_taxid', 'organism_name',
            'infraspecific_name', 'isolate', 'version_status',
            'assembly_level', 'release_type', 'genome_rep', 'seq_rel_date',
            'asm_name', 'submitter', 'gbrs_paired_asm', 'paired_asm_comp',
            'ftp_path', 'excluded_from_refseq', 'relation_to_type_material',
        ]
        table = pd.read_table(assembly_summary,
                              comment='#', header=None, names=column_names,
                              dtype={'taxid': str, 'infraspecific_name': str})

        # Normalise the categorical text columns to lower case.
        for column in ('refseq_category', 'assembly_level', 'genome_rep'):
            table[column] = table[column].str.lower()

        self.assembly_summary = table.set_index('assembly_accession')
        self.genome_taxids = None

In [37]:
%%add_to taxa_sample
def download_genome(self, accession_numbers, destination_folder='.'):
    """
    Download the protein FASTA archive of each requested assembly.

    For every accession in ``accession_numbers`` the directory named by the
    ``ftp_path`` column of ``self.assembly_summary`` is opened on
    ``ftp.ncbi.nlm.nih.gov``, and every file in it whose name ends with
    ``protein.faa.gz`` is saved as ``<destination_folder>/<accession>.gz``.

    Parameters
    ----------
    accession_numbers : list
        Index labels of ``self.assembly_summary`` to fetch.
    destination_folder : str
        Directory the archives are written into; it is not created here.

    Returns
    -------
    list
        The accession of every archive written, in download order.
        Assemblies whose FTP directory cannot be entered are skipped silently.
    """
    successful_download = []
    # The context manager sends QUIT / closes the socket even if a transfer
    # raises, so the connection is never left dangling.
    with ftp.FTP('ftp.ncbi.nlm.nih.gov') as ncbi_ftp:
        ncbi_ftp.login()
        for index, row in self.assembly_summary.loc[accession_numbers].iterrows():
            # Paths in the table are relative to the server root.
            ncbi_ftp.cwd('/')
            path = row['ftp_path'].replace('ftp://ftp.ncbi.nlm.nih.gov/', '')
            try:
                ncbi_ftp.cwd(path)
            except ftp.all_errors:
                # Best effort: a missing/unreachable directory skips this
                # assembly; unrelated errors are no longer swallowed.
                continue
            for assembly_file in ncbi_ftp.nlst():
                if assembly_file.endswith('protein.faa.gz'):
                    # Close (and therefore flush) the archive before reporting
                    # it, so callers can decompress it immediately.
                    with open('%s/%s.gz' % (destination_folder, index), 'wb') as handle:
                        ncbi_ftp.retrbinary("RETR %s" % assembly_file, handle.write)
                    successful_download.append(index)
    return successful_download

In [4]:
%%add_to taxa_sample
def prune_taxa(self, accession_numbers, assembly_level='contig', genome_rep='partial'):
    """
    Drop accessions that have an unwanted assembly level or genome representation.

    Parameters
    ----------
    accession_numbers : list
        Index labels of ``self.assembly_summary`` to screen.
    assembly_level : str
        Value of the ``assembly_level`` column that disqualifies a genome.
    genome_rep : str
        Value of the ``genome_rep`` column that disqualifies a genome.

    Returns
    -------
    list
        The accessions matching neither disqualifying value, in the order
        they were requested.
    """
    screened_columns = ['assembly_level', 'genome_rep']
    candidates = self.assembly_summary.loc[accession_numbers, screened_columns]

    level_is_acceptable = candidates['assembly_level'] != assembly_level
    rep_is_acceptable = candidates['genome_rep'] != genome_rep

    survivors = candidates[level_is_acceptable & rep_is_acceptable]
    return survivors.index.tolist()

In [5]:
%%add_to taxa_sample
def genomes_from_taxon(self, target_taxon):
    """
    List the assembly accessions that fall under ``target_taxon``.

    The taxonomy database behind the module-level ``ncbi`` object is asked for
    every taxid present in ``self.assembly_summary`` whose comma-wrapped
    ``track`` value contains ``,<target_taxon>,``. The accessions whose
    ``taxid`` is among the hits are returned as a list.

    The comma-joined taxid string is built on the first call and kept in
    ``self.genome_taxids`` for later calls.
    """
    if not self.genome_taxids:
        unique_taxids = set(self.assembly_summary.taxid.tolist())
        self.genome_taxids = ','.join(unique_taxids)

    # NOTE(review): values are interpolated straight into the SQL text;
    # target_taxon is presumably always a numeric taxid string -- confirm.
    sql = "SELECT taxid FROM species WHERE taxid IN (%s) AND \
                             ',' || track || ',' like '%%,%s,%%';" % (
        self.genome_taxids, target_taxon)
    cursor = ncbi.db.execute(sql)

    # Table taxids are strings, so the hits are converted to match.
    found_taxids = []
    for record in cursor.fetchall():
        found_taxids.append(str(record[0]))

    belongs_to_taxon = self.assembly_summary.taxid.isin(found_taxids)
    return self.assembly_summary.index[belongs_to_taxon].tolist()

In [40]:
import os
import shutil
import subprocess
import re

# Build the shared sample object; with no argument, taxa_sample reads the
# assembly summary from its default path.
genome_sample = taxa_sample()
%cd /work/index_hgt

/work/index_hgt


  if (yield from self.run_code(code, result)):


In [11]:
# Parent directory for the per-taxon download folders.
os.mkdir('taxa')

# Maps each folder name to the taxid string handed to genomes_from_taxon.
taxa = {
    'thermoplasmatales':      '2301',
    'methanomassiliicocales': '1235850',
    'sulfolobales':           '2281',
    'desulforococales':       '114380',
    'thermoproteales':        '2266',
}

In [64]:
# Captures the first whitespace-delimited token after '>' on a FASTA header.
# Raw string: '\S' in a plain literal is an invalid escape sequence.
header_pattern = re.compile(r'^>(\S+)', re.M)

with cd('taxa'):
    # Relabelled FASTA files are written next to the 'taxa' folder; make sure
    # the target exists before any download time is spent.
    os.makedirs('../genomes', exist_ok=True)

    for taxon_name, taxid in taxa.items():
        os.mkdir(taxon_name)

        # Select, filter and fetch the genomes belonging to this taxon.
        genome_accessions  = genome_sample.genomes_from_taxon(taxid)
        pruned_genomes     = genome_sample.prune_taxa(genome_accessions)
        downloaded_genomes = genome_sample.download_genome(pruned_genomes, destination_folder=taxon_name)
        for genome in downloaded_genomes:
            # gunzip replaces '<genome>.gz' with the decompressed '<genome>'.
            subprocess.call(['gunzip', '%s/%s.gz' % (taxon_name, genome)])

            # The organism label is identical for every sequence of a genome,
            # so it is built once here rather than once per header line.
            organism = genome_sample.assembly_summary.loc[genome, 'organism_name']
            infraspecific_name = genome_sample.assembly_summary.loc[genome, 'infraspecific_name']
            strain = ''
            if pd.notnull(infraspecific_name):
                strain = infraspecific_name.replace('strain=', '')
            # Append the strain only when the organism name does not already
            # end with it (an empty strain never triggers the append).
            if not organism.endswith(strain):
                organism = '%s %s' % (organism, strain)

            # Both files are closed on exit, even if a header fails to parse.
            with open('%s/%s' % (taxon_name, genome)) as fasta, \
                 open('../genomes/%s.faa' % genome, 'w') as out:
                for line in fasta:
                    if line.startswith('>'):
                        # New header: '>sequence|assembly [organism]'.
                        sequence_acc = header_pattern.search(line).group(1)
                        out.write('>%s|%s [%s]\n' % (sequence_acc, genome, organism))
                    else:
                        out.write(line)