In [1]:
import os
import sys
import re
import pandas as pd
import Bio
from Bio import Entrez
from Bio import SeqIO
from ftplib import FTP

# Show every column and row, and never truncate cell contents, when displaying DataFrames.
# `None` is the supported "no limit" value for display.max_colwidth; the former -1
# sentinel is deprecated and rejected by current pandas releases.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [2]:
#INPUTS
# Root folder that find_common_reference() walks recursively looking for files ending in "screen.tab".
folder='/processing_Data/antibioticos/mperezv/ANALYSIS/ST307_publicacion/mash'

In [3]:
#Test inputs
# acc: accession handed to acc_to_record() to try the download.
# screen_file: one screen table used to try return_best_complete_match() and return_best_match().
acc='NZ_CP026495.1'
screen_file='/processing_Data/antibioticos/mperezv/ANALYSIS/ST307_publicacion/mash/ERR1218732/ERR1218732.screen.tab'

In [4]:
def acc_to_record(accession_number):
    """Download one nucleotide record in FASTA format through Entrez efetch.

    Parameters
    ----------
    accession_number : str
        Identifier passed as `id` to Entrez.efetch (db="nucleotide").

    Returns
    -------
    The record parsed by SeqIO.read from the FASTA response.

    On any failure while fetching or parsing, a message naming the requested
    accession is printed and the function exits through sys.exit(1)
    (i.e. SystemExit is raised).
    """
    Entrez.email = "A.N.Other@example.com"
    try:
        handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="fasta", retmode="text")
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Release the connection even when parsing fails.
            handle.close()
    except Exception as error:
        # `record` does not exist when the download fails, so report the
        # accession that was requested instead of record.id.
        print(str(accession_number) + " failed to download: " + str(error))
        sys.exit(1)

    print("Downloaded: " + record.description)
    print("Downloaded: " + str(len(record)))
    return record

In [5]:
record = acc_to_record(acc)

Downloaded: NZ_CP026495.1 Klebsiella pneumoniae strain 616 chromosome, complete genome
Downloaded: 5246307


In [6]:
record.id

'NZ_CP026495.1'

In [40]:
def return_best_complete_match(screen_file):
    """Return the highest-identity complete-genome hit of a screen table.

    The table is read as tab-separated text without a header row, using the
    columns identity, shared-hashes, median-multiplicity, p-value, query-ID
    and query-comment. A hit is kept when its comment contains
    'complete genome' and does not contain ' phage', ' Phage' or 'shotgun'.

    Parameters
    ----------
    screen_file : str or file-like
        Path (or buffer) accepted by pandas.read_csv.

    Returns
    -------
    list
        [query-ID, query-comment] of the kept hit with the highest identity.

    Raises
    ------
    IndexError
        If no hit passes the filters.
    """
    column_names = ['identity', 'shared-hashes', 'median-multiplicity', 'p-value', 'query-ID', 'query-comment']
    df = pd.read_csv(screen_file, sep='\t', names=column_names)

    # Empty comments are parsed as NaN; treat them as empty text so they are
    # simply filtered out instead of breaking the boolean masks below.
    comments = df['query-comment'].fillna('').astype(str)
    is_complete = comments.str.contains('complete genome')
    is_excluded = comments.str.contains(' phage| Phage|shotgun')

    df_complete = df[is_complete & ~is_excluded].sort_values(by=['identity'], ascending=False)
    if df_complete.empty:
        raise IndexError("No complete genome match found in " + str(screen_file))

    return df_complete.iloc[0][['query-ID', 'query-comment']].tolist()

In [26]:
def return_best_match(screen_file):
    """Return [query-ID, query-comment] of the hit with the highest identity.

    The screen table is read as header-less, tab-separated text and no
    filtering is applied to the comments.
    """
    column_names = ['identity', 'shared-hashes', 'median-multiplicity', 'p-value', 'query-ID', 'query-comment']
    hits = pd.read_csv(screen_file, sep='\t', names=column_names)
    ranked_hits = hits.sort_values(by=['identity'], ascending=False).reset_index()
    top_hit = ranked_hits.iloc[0]
    return top_hit[['query-ID','query-comment']].tolist()

In [27]:
dfcomplete = return_best_complete_match(screen_file)

In [28]:
dfcomplete

['GCF_003076555.1_ASM307655v1_genomic.fna.gz',
 '[3 seqs] NZ_CP026495.1 Klebsiella pneumoniae strain 616 chromosome, complete genome [...]']

In [29]:
dfb = return_best_match(screen_file)

In [30]:
dfb

['GCF_900092885.1_18090_8_42_genomic.fna.gz',
 '[30 seqs] NZ_FLWY01000029.1 Klebsiella pneumoniae strain PB107, whole genome shotgun sequence [...]']

In [31]:
%%time

# Load the RefSeq bacteria assembly summary directly from the NCBI FTP site.
# header=1 takes the second line of the file as column names, so the line above it is not parsed as data.
dfrefseq = pd.read_csv('ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt', skiprows=0, sep='\t', header=1)


CPU times: user 1.3 s, sys: 195 ms, total: 1.49 s
Wall time: 8.29 s


In [32]:
dfrefseq.shape

(181083, 22)

In [33]:
dfrefseq.head(4)

Unnamed: 0,# assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,isolate,version_status,assembly_level,release_type,genome_rep,seq_rel_date,asm_name,submitter,gbrs_paired_asm,paired_asm_comp,ftp_path,excluded_from_refseq,relation_to_type_material
0,GCF_000010525.1,PRJNA224116,SAMD00060925,,representative genome,438753,7,Azorhizobium caulinodans ORS 571,strain=ORS 571,,latest,Complete Genome,Major,Full,2007/10/16,ASM1052v1,University of Tokyo,GCA_000010525.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/010/525/GCF_000010525.1_ASM1052v1,,assembly from type material
1,GCF_000007365.1,PRJNA224116,SAMN02604269,,representative genome,198804,9,Buchnera aphidicola str. Sg (Schizaphis graminum),strain=Sg,,latest,Complete Genome,Major,Full,2002/07/02,ASM736v1,Uppsala Univ.,GCA_000007365.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/007/365/GCF_000007365.1_ASM736v1,,
2,GCF_000007725.1,PRJNA224116,SAMN02604289,,representative genome,224915,9,Buchnera aphidicola str. Bp (Baizongia pistaciae),strain=Bp (Baizongia pistaciae),,latest,Complete Genome,Major,Full,2003/01/29,ASM772v1,Valencia Univ.,GCA_000007725.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/007/725/GCF_000007725.1_ASM772v1,,
3,GCF_000009605.1,PRJNA57805,SAMD00061095,,reference genome,107806,9,Buchnera aphidicola str. APS (Acyrthosiphon pisum),strain=APS,Tokyo1998,latest,Complete Genome,Major,Full,2004/05/11,ASM960v1,Rikken GSC,GCA_000009605.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/009/605/GCF_000009605.1_ASM960v1,,


In [34]:
ftp = dfrefseq['ftp_path'][dfrefseq['# assembly_accession'] =='GCF_003076555.1']
ftp

46847    ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/076/555/GCF_003076555.1_ASM307655v1
Name: ftp_path, dtype: object

In [35]:
ftp.values[0].split('.gov')[-1]

'/genomes/all/GCF/003/076/555/GCF_003076555.1_ASM307655v1'

In [36]:
def gcf_to_ftp_path(gcf_value, assembly_summary=None):
    """Return the server-side path of an assembly, without the FTP domain.

    Parameters
    ----------
    gcf_value : str
        Assembly accession looked up in the '# assembly_accession' column,
        e.g. 'GCF_003076555.1'.
    assembly_summary : pandas.DataFrame, optional
        Already loaded assembly summary with the columns
        '# assembly_accession' and 'ftp_path'. When omitted, the RefSeq
        bacteria summary is downloaded from the NCBI FTP site on every call;
        passing a preloaded table avoids repeating that download.

    Returns
    -------
    str
        Everything after '.gov' in the matching 'ftp_path' value.

    Raises
    ------
    IndexError
        If the accession is not present in the summary table.

    If the summary cannot be downloaded, a message is printed and the
    function exits through sys.exit(1).
    """
    if assembly_summary is None:
        try:
            assembly_summary = pd.read_csv('ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt', skiprows=0, sep='\t', header=1)
        except Exception:
            print('There was a problem obtaining assembly_summary.txt\n \
        Check: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt')
            sys.exit(1)

    is_requested = assembly_summary['# assembly_accession'] == gcf_value
    ftp_path = assembly_summary.loc[is_requested, 'ftp_path']
    if ftp_path.empty:
        raise IndexError(str(gcf_value) + " was not found in assembly_summary.txt")

    # Keep only the path on the server: drop the scheme and host.
    ftp_path_no_domain = ftp_path.values[0].split('.gov')[-1]

    return ftp_path_no_domain


In [37]:
gcf_to_ftp_path('GCF_003076555.1')

'/genomes/all/GCF/003/076/555/GCF_003076555.1_ASM307655v1'

In [54]:
def find_common_reference(folder):
    '''
    Pick the complete-genome reference that is most often the best match.

    Walks `folder` recursively, calls return_best_complete_match() on every
    file whose name ends in "screen.tab", counts how often each
    (query-comment, query-ID) pair is returned and keeps the most frequent
    one. Pairs with the same count keep the order produced by the groupby
    (a stable sort is used), so the result is reproducible.

    Dependencies:   -input folder
                    -return_best_complete_match(filename)

    Returns:        tuple (accession, description)
                    accession   -> first two "_"-separated fields of the query-ID
                                   (e.g. 'GCF_003076555.1')
                    description -> query-comment with short bracketed tags such as
                                   "[3 seqs]" and "[...]" removed

    Raises:         IndexError if no file ending in "screen.tab" is found.
    '''
    #Collect the best complete match of every screen file first and build the
    #dataframe once, instead of appending to it row by row
    best_matches = []
    for root, _, files in os.walk(folder):
        for name in files:
            if name.endswith("screen.tab"):
                filename = os.path.join(root, name)
                best_matches.append(return_best_complete_match(filename))

    if not best_matches:
        raise IndexError("No file ending in 'screen.tab' was found under " + str(folder))

    counter_record_complete = pd.DataFrame(best_matches, columns=['query-ID', 'query-comment'])

    #groupby().size() counts each pair; reset_index turns the counts into a dataframe
    counter_df_complete = (
        counter_record_complete
        .groupby(['query-comment', 'query-ID']).size().reset_index(name='counts')
        .sort_values(by=['counts'], ascending=False, kind='mergesort')
        .reset_index(drop=True)
    )

    top_reference = counter_df_complete.iloc[0]
    gcf_complete = '_'.join(top_reference['query-ID'].split('_')[0:2])
    gcf_description = re.sub(r'[ ]?\[.{1,9}\][ ]?', '', top_reference['query-comment'])

    return gcf_complete, gcf_description

In [55]:
matchdf = find_common_reference(folder)

In [56]:
matchdf

('GCF_003076555.1',
 'NZ_CP026495.1 Klebsiella pneumoniae strain 616 chromosome, complete genome')

In [53]:
re.sub(r'[ ]?\[.{1,9}\][ ]?','','[3 seqs] NC_008782.1 Acidovorax sp. JS42, complete genome [...]')

'NC_008782.1 Acidovorax sp. JS42, complete genome'

In [90]:
('_').join(matchdf.iloc[0]['query-ID'].split('_')[0:2])

'GCF_003076555.1'

In [82]:
find_common_reference('/processing_Data/antibioticos/mperezv/ANALYSIS/KPNCARSUR/10-mash')

Unnamed: 0,query-comment,query-ID,counts
0,"[2 seqs] NZ_CP010226.1 Escherichia coli strain S1, complete genome [...]",GCF_001901315.1_ASM190131v1_genomic.fna.gz,1
1,"[3 seqs] NZ_CP026495.1 Klebsiella pneumoniae strain 616 chromosome, complete genome [...]",GCF_003076555.1_ASM307655v1_genomic.fna.gz,1


In [94]:
find_common_reference('/processing_Data/antibioticos/mperezv/ANALYSIS/ECOCARSUR/mash')

GCF_002012205.1


Unnamed: 0,query-comment,query-ID,counts
0,"[4 seqs] NZ_CP018983.1 Escherichia coli strain Ecol_867 chromosome, complete genome [...]",GCF_002012205.1_ASM201220v1_genomic.fna.gz,4
1,"NC_020518.1 Escherichia coli str. K-12 substr. MDS42 DNA, complete genome",GCF_000350185.1_ASM35018v1_genomic.fna.gz,2
2,"NZ_CP007592.1 Escherichia coli O157:H16 strain Santai, complete genome",GCF_000827105.1_ASM82710v1_genomic.fna.gz,1
3,"NZ_CP016182.2 Escherichia coli strain EC590 chromosome, complete genome",GCF_001682305.2_ASM168230v2_genomic.fna.gz,1
4,"[2 seqs] NZ_CP010226.1 Escherichia coli strain S1, complete genome [...]",GCF_001901315.1_ASM190131v1_genomic.fna.gz,1
5,"[3 seqs] NZ_CP010140.1 Escherichia coli strain D3, complete genome [...]",GCF_001900635.1_ASM190063v1_genomic.fna.gz,1
6,"[3 seqs] NZ_CP018965.1 Escherichia coli strain Ecol_517 chromosome, complete genome [...]",GCF_002012005.1_ASM201200v1_genomic.fna.gz,1
7,"[4 seqs] NZ_CP014111.1 Escherichia coli strain FDAARGOS_144 chromosome, complete genome [...]",GCF_002944935.1_ASM294493v1_genomic.fna.gz,1
8,"[4 seqs] NZ_CP023353.1 Escherichia coli strain 746 chromosome, complete genome [...]",GCF_002310655.1_ASM231065v1_genomic.fna.gz,1
9,"[4 seqs] NZ_CP027126.1 Escherichia coli strain AR_0374 chromosome, complete genome [...]",GCF_002999075.1_ASM299907v1_genomic.fna.gz,1


In [8]:
def download_gcf (ftp_address, output_dir, all_data=False ):
    """Download an assembly folder (or only its genomic FASTA) from the NCBI FTP server.

    Parameters
    ----------
    ftp_address : str
        Directory on ftp.ncbi.nlm.nih.gov to change into, e.g.
        'genomes/all/GCF/003/076/555/GCF_003076555.1_ASM307655v1'.
        A trailing '/' is tolerated.
    output_dir : str
        Local folder receiving the files; created (with parents) if missing.
    all_data : bool, optional
        False (default): download only '<last folder name>_genomic.fna.gz'.
        True: download every entry listed in the remote directory.

    Returns
    -------
    None

    If the output folder cannot be created, a message is printed and the
    function exits through sys.exit(1). FTP errors propagate to the caller;
    the connection is closed in every case.
    """
    output_dir = os.path.abspath(output_dir)

    if not os.path.exists(output_dir):
        print("path " + output_dir + " doesn't exist and will be created")
        try:
            # makedirs also creates missing parent folders.
            os.makedirs(output_dir, exist_ok=True)
        except OSError:
            print("Folder " + output_dir + " can't be created")
            sys.exit(1)

    # Strip a trailing slash so the last path component is never empty.
    ftp_folder = ftp_address.rstrip('/').split('/')[-1]
    assembly_file = ftp_folder + '_genomic.fna.gz'

    # The context manager closes the connection even if a transfer fails.
    with FTP('ftp.ncbi.nlm.nih.gov') as ftp:
        ftp.login()

        # Get All Files
        ftp.cwd(ftp_address)
        files = ftp.nlst()

        if all_data:
            wanted_files = files
        else:
            wanted_files = [file for file in files if file == assembly_file]
            if not wanted_files:
                print(assembly_file + " was not found in " + ftp_address)

        for file in wanted_files:
            local_path = os.path.join(output_dir, file)
            with open(local_path, 'wb') as f:
                print("Downloading.." + file)
                ftp.retrbinary('RETR ' + file, f.write)

In [10]:
download_gcf('genomes/all/GCF/003/076/555/GCF_003076555.1_ASM307655v1', '/processing_Data/antibioticos/mperezv/ANALYSIS/ST307_publicacion/REFERENCESII/', False)

Downloading..GCF_003076555.1_ASM307655v1_genomic.fna.gz
