In [82]:
import os
import subprocess
import sys

import pandas as pd
import Bio
from Bio import Entrez
from Bio import SeqIO

# Lift pandas' display limits so frames are rendered in full (all columns,
# all rows, untruncated cell text).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None means "no limit"; the former sentinel -1 is deprecated since pandas 1.0
# and rejected by later releases.
pd.set_option('display.max_colwidth', None)

In [75]:
def check_create_dir(path):
    """
    Make sure the directory ``path`` exists, creating it when necessary.

    Missing parent directories are created as well, so a nested path such as
    ``reads/<accession>`` works even when ``reads`` does not exist yet.
    Calling it on an already existing directory is a no-op.

    Parameters
    ----------
    path : str
        Directory to create.

    Raises
    ------
    FileExistsError
        If ``path`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate "exists?" check, which was racy and
    # could not create intermediate directories.
    os.makedirs(path, exist_ok=True)

In [119]:
def execute_subprocess(cmd):
    """
    Run an external command and print a readable report of its outcome.

    https://crashcourse.housegordon.org/python-subprocess.html
    https://docs.python.org/3/library/subprocess.html
    Execute and handle errors with subprocess, outputting stderr instead of the subprocess CalledProcessError

    Parameters
    ----------
    cmd : list
        Program name followed by its arguments, e.g. ``['ls', '-l']``.

    Returns
    -------
    None
        The outcome is reported with ``print`` only.

    Raises
    ------
    SystemExit
        If the program cannot be launched at all (``OSError``, e.g. the
        executable is not found).
    """
    prog = cmd[0]
    param = cmd[1:]

    try:
        command = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as e:
        # Requires the module-level `import sys`.
        sys.exit("failed to execute program '%s': %s" % (prog, str(e)))

    # Decode both streams so the report shows text, not a bytes repr (b'...').
    stdout_text = command.stdout.decode(errors="replace").strip()
    stderr_text = command.stderr.decode(errors="replace").strip()

    if command.returncode == 0:
        print("Program %s successfully executed" % prog)
        print(stdout_text)
        # A command can write to stderr even when it succeeds, so show it too.
        print(stderr_text)
    else:
        # stderr is part of the failure report, so it is not printed again below.
        print("Command %s FAILED\n" % prog +
              "WITH PARAMETERS: " + " ".join(map(str, param)) + "\n" +
              "EXIT-CODE: %d\n" % command.returncode +
              "ERROR:\n" + stderr_text)
        print(stdout_text)

In [104]:
# Spreadsheet with the strain metadata that is loaded into a DataFrame below.
file = "/processing_Data/antibioticos/mperezv/ANALYSIS/KPN_HV/41467_2018_5114_MOESM4_ESM_HV_clean.xlsx"

In [105]:
# Load the spreadsheet at `file` into a DataFrame.
df = pd.read_excel(io=file)

In [106]:
# Display the loaded table (the display limits were lifted in the first cell).
df

Unnamed: 0,Genome_name,Strain,Accession_Number,Reference,Country,Year,Source,Sample_type,ST,Sublineage I,ICEKp_structure,ICEKp_site,YBT_lineage,YbST,clb,microcin,Vir_plasmid,iro,iuc,rmpA,rmpA2,AMR genes,Plasmid_replicons,Phage,Phage_length,Missing_cas_genes,Num_CRISPR_arrays,Repeat_num(s),Missing_CRISPR_spacers,Additional_CRISPR_spacers
0,5193_1_4,07-0003m,ERR025468,Holt2015,Vietnam,2007,human,blood (liver abscess),23,yes,ICEKp10,1,1.0,47,1,1*^,1,1,1,1,1,-,"ColF, FIBK",2,34.6;46.3,All,0,na,26,0.0
1,5235_8_2,08-049B,ERR025673,Holt2015,Vietnam,2008,human,sputum (pneumonia),23,yes,ICEKp10,?,1.0,60-4LV,1,1,1,1,1,1,1,-,FIBK,1,39.9,na,2,85,26,0.0
2,ERR562357,SB4467,ERR562357,Bialek2014,France,2011,human,unknown,23,yes,ICEKp10,1,1.0,330,1,1,1,1,1,1,1,-,FIBK,2,33.1;48,"cse1, cse2, cse3, cse4, cas5e, cas1, cas2 (7/7)",0,na,26,0.0
3,ERR713522,CAS691,ERR713522,Struve2015,Canada,2005,human,liver abscess,23,yes,ICEKp10,1,1.0,47,1,1,1,1,1,1,1,-,"FIA, FIIK, FIBK",3,36.4;39.8;47.6,"cse1, cse2, cse3, cse4, cas5e, cas1, cas2 (7/7)",1,8,26,7.0
4,ERR713524,CAS690,ERR713524,Struve2015,Canada,2005,human,liver abscess,23,yes,ICEKp10,1,1.0,47,1,1,1,1,1,1,1,-,"FIAHI1, FIIK, FIBK",3,28.3;34.6;37.8,"cse1, cse2, cse3, cse4, cas5e, cas1, cas2 (7/7)",1,8,26,7.0
5,SB4812,IPEUC650,SRR5713913,This study,France,1980,horse,genital tract,23,yes,ICEKp10,1,1.0,47,1,1,1,1,1,1,1,"aadA1, sul1, aph3''Ia, tetAR","FII, ColRNAI",1,38.8,na,2,94,26,11.0
6,ERR712899,CAS987,ERR712899,Struve2015,Taiwan,1996,human,liver abscess,23,yes,ICEKp10,1,1.0,47,1,1,1,1,1,1,1,-,-,1,43.1,na,2,97,24,12.0
7,ERR560505,SB4384,ERR560505,Bialek2014,France,2008,human,blood,23,yes,ICEKp10,?,1.0,47,1,1^,1,1,1,1,1,-,"HI1B, FIBK",3,39.9;41.2;122.2,na,2,97,23,11.0
8,ERR712793,CAS985,ERR712793,Struve2015,Taiwan,1996,human,liver abscess,23,yes,ICEKp10,1,1.0,335,1,1,1,1,1,1,1,-,FIB,2,38.8;129.5,na,2,98,26,15.0
9,573_1660,BIDMC85,SRS1011229,GenBank,unknown,2013,human,unknown,23,yes,ICEKp10,1,1.0,47,1*,1,1,1,1,1,1,-,"FIB, FIBK",2,34.6;112.7,na,2,116,23,12.0


In [107]:
# Rows whose accession starts with neither 'SR' nor 'ERR', i.e. the entries
# that are not handled by the SRA download further down.
df.loc[
    ~df["Accession_Number"].str.startswith('SR')
    & ~df["Accession_Number"].str.startswith('ERR')
]

Unnamed: 0,Genome_name,Strain,Accession_Number,Reference,Country,Year,Source,Sample_type,ST,Sublineage I,ICEKp_structure,ICEKp_site,YBT_lineage,YbST,clb,microcin,Vir_plasmid,iro,iuc,rmpA,rmpA2,AMR genes,Plasmid_replicons,Phage,Phage_length,Missing_cas_genes,Num_CRISPR_arrays,Repeat_num(s),Missing_CRISPR_spacers,Additional_CRISPR_spacers
19,ED2,ED2,CP016813.1,GenBank,Taiwan,2006,human,blood,23,yes,ICEKp10,1.0,1.0,336.0,1,1,0,0,0,0,0,-,-,1,34.6,na,2,129,21,14.0
38,1193292_6,1084,CP003785.1,Lin2012,Taiwan,2002,human,blood (liver abscess),23,yes,ICEKp10,1.0,1.0,47.0,1,1,0,0,0,0,0,-,-,1,33.1,na,2,159,19,15.0
67,484021_57,NTUH-K2044,AP006725.1; AP006726.1,Wu2009,Taiwan,1996-2001,human,blood (liver abscess & meningitis),23,no,ICEKp1,1.0,2.0,326.0,0,0,1,1,1,1,1,-,"HI1B, FIBK",1,28.8,na,2,244,na,
69,ED23,ED23,CP016814.1; CP016815.1,GenBank,Taiwan,2006,human,blood,23,no,ICEKp3,3.0,9.0,331.0,0,0,1,1,1,1,1,-,HI1B,3,34.6;57.5;58.3,na,2,235,6,6.0
87,RJF999,RJF999,CP014010.1; CP014011.1,GenBank,China,2015,human,blood,23,yes,ICEKp10,1.0,1.0,338.0,1,1,1,1,1,1,1,-,HI1B,3,34.6;37.9;58.3,na,2,2910,18,29.0
89,NCTC9494,4428-53,NZ_UGMO01000002.1,NCTC,unknown,1954,human,sputum,23,no,no_ybt,,,,0,0,1,1,1,1,1,-,"HI1B, FIBK",3,34.6;39.2;51.4,na,2,337,4,16.0


In [108]:
# All values of the Accession_Number column as a plain Python list.
acc = df["Accession_Number"].tolist()

In [109]:
# Accessions that start with neither 'ERR' nor 'SR'. When an entry lists
# several accessions separated by ';', only the first one is kept.
acc_ncbi = [
    accession.split(';')[0]
    for accession in acc
    if not accession.startswith(('ERR', 'SR'))
]

In [110]:
# Show the accessions that will be fetched from the NCBI nucleotide database.
print(acc_ncbi)

['CP016813.1', 'CP003785.1', 'AP006725.1', 'CP016814.1', 'CP014010.1', 'NZ_UGMO01000002.1']


In [111]:
# First accession of every entry that lists more than one (contains ';').
# NOTE(review): index 0 is the same accession already collected in acc_ncbi.
# If the second accession of each pair is what is wanted here, the element
# after the ';' (stripped of whitespace) would be needed instead - confirm intent.
acc_plasmid = [entry.partition(';')[0] for entry in acc if ';' in entry]

In [112]:
# Display the accessions taken from multi-accession entries.
acc_plasmid

['AP006725.1', 'CP016814.1', 'CP014010.1']

In [113]:
# Accessions starting with 'ERR' or 'SR'; these are passed to fastq-dump below.
acc_srr = [accession for accession in acc if accession.startswith(('ERR', 'SR'))]

In [114]:
# Show the accessions selected for the SRA download.
print(acc_srr)

['ERR025468', 'ERR025673', 'ERR562357', 'ERR713522', 'ERR713524', 'SRR5713913', 'ERR712899', 'ERR560505', 'ERR712793', 'SRS1011229', 'ERR713643', 'ERR024843', 'ERR562359', 'ERR699266', 'ERR025111', 'ERR718793', 'ERR562360', 'ERR712794', 'ERR712903', 'ERR560501', 'ERR230425', 'ERR562354', 'ERR712791', 'ERR713523', 'SRR5713914', 'SRR5082361', 'SRR5082378', 'ERR562351', 'ERR699269', 'ERR713521', 'SRR5713920', 'SRR5713919', 'SRR5713911', 'ERR560521', 'ERR560523', 'ERR712901', 'SRR5082364', 'ERR025117', 'ERR025654', 'ERR560506', 'ERR560508', 'ERR562352', 'ERR712624', 'ERR712792', 'ERR712904', 'ERR713641', 'ERR713642', 'SRR5082373', 'SRR5082380', 'SRR5082377', 'SRR5713915', 'SRR5713918', 'SRR5713917', 'SRR5713922', 'SRR5713909', 'SRR5713908', 'SRR5713910', 'SRR5082359', 'SRR5082357', 'ERR257665', 'ERR562350', 'ERR712900', 'SRR5713921', 'ERR1015301', 'ERR1015341', 'ERR712789', 'ERR276927', 'SRS1011222', 'SRR5082328', 'SRR2098776', 'ERR712626', 'ERR025115', 'ERR712623', 'ERR562353', 'ERR706870

# DOWNLOAD NCBI

In [115]:
# Contact address that Biopython's Entrez module uses for its requests.
# NOTE(review): this value looks like a placeholder - set a real contact
# email before running the download cell.
Entrez.email = "A.N.Other@example.com"

In [116]:
# Fetch every accession in acc_ncbi from the NCBI nucleotide database and
# store it as reads/<accession>/<accession>.fasta. Accessions whose download
# or write fails are collected in failed_sequences instead of aborting the loop.
failed_sequences = []
total_sequences = len(acc_ncbi)
# Create the parent folder first so the per-accession folders can be made
# even on a fresh working directory.
check_create_dir('reads')
for current_record, record_item in enumerate(acc_ncbi, start=1):
    print(record_item)
    out_dir = os.path.join('reads', record_item)
    check_create_dir(out_dir)
    out_filename = record_item + ".fasta"
    out_filepath = os.path.join(out_dir, out_filename)
    try:
        handle = Entrez.efetch(db="nucleotide", id=record_item, rettype="fasta", retmode="text")
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Release the connection even when parsing fails.
            handle.close()
        print(" %s/%s Downloading record %s" % (current_record, total_sequences, record.id))
        with open(out_filepath, 'w') as output_handle:
            SeqIO.write(record, output_handle, "fasta")
    except Exception as e:
        # Report the accession being processed: `record` may be undefined or
        # left over from the previous iteration when the fetch itself fails.
        print(" %s/%s Failed to download %s: %s" % (current_record, total_sequences, record_item, e))
        failed_sequences.append(record_item)

CP016813.1
 1/6 Downloading record CP016813.1
CP003785.1
 2/6 Downloading record CP003785.1
AP006725.1
 3/6 Downloading record AP006725.1
CP016814.1
 4/6 Downloading record CP016814.1
CP014010.1
 5/6 Downloading record CP014010.1
NZ_UGMO01000002.1
 6/6 Downloading record NZ_UGMO01000002.1


# DOWNLOAD SRA

In [125]:
# For every accession in acc_srr: create reads/<accession>/ and assemble the
# fastq-dump command (--split-files, --gzip) that would write into that folder.
for sra_item in acc_srr:
    print(sra_item)
    check_create_dir('reads/' + sra_item)
    # Despite its name, out_filepath is the output *directory* given to --outdir.
    out_filepath = os.path.join('reads', sra_item)
    cmd = ['fastq-dump', '--split-files', '--gzip', '--outdir', out_filepath, sra_item]
    # NOTE(review): the launch is commented out, so re-running this cell only
    # creates the folders and prints the accessions. The fastq-dump output
    # recorded below comes from an earlier run with this line enabled.
    #execute_subprocess(cmd)

ERR025468
Program fastq-dump successfully executed
b'Rejected 4320272 READS because READLEN < 1\nRead 4320272 spots for ERR025468\nWritten 4320272 spots for ERR025468\n'

ERR025673
Program fastq-dump successfully executed
b'Rejected 4385820 READS because READLEN < 1\nRead 4385820 spots for ERR025673\nWritten 4385820 spots for ERR025673\n'

ERR562357
Program fastq-dump successfully executed
b'Read 16780610 spots for ERR562357\nWritten 16780610 spots for ERR562357\n'

ERR713522
Command fastq-dump FAILED
WITH PARAMETERS: --split-files --gzip --outdir reads/ERR713522 ERR713522
EXIT-CODE: 3
ERROR:
2020-02-28T13:46:04 fastq-dump.2.8.0 err: no error - error with http open 'https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos1/sra-pub-run-2/ERR713522/ERR713522.1'
2020-02-28T13:46:04 fastq-dump.2.8.0 err: item not found while constructing within virtual database module - the path 'ERR713522' cannot be opened as database or table
b''
2020-02-28T13:46:04 fastq-dump.2.8.0 err: no error - error with ht

Program fastq-dump successfully executed
b'Read 1517317 spots for ERR713642\nWritten 1517317 spots for ERR713642\n'

SRR5082373
Program fastq-dump successfully executed
b'Read 626148 spots for SRR5082373\nWritten 626148 spots for SRR5082373\n'

SRR5082380
Program fastq-dump successfully executed
b'Read 657267 spots for SRR5082380\nWritten 657267 spots for SRR5082380\n'

SRR5082377
Program fastq-dump successfully executed
b'Read 346109 spots for SRR5082377\nWritten 346109 spots for SRR5082377\n'

SRR5713915
Program fastq-dump successfully executed
b'Read 20958323 spots for SRR5713915\nWritten 20958323 spots for SRR5713915\n'

SRR5713918
Program fastq-dump successfully executed
b'Read 540845 spots for SRR5713918\nWritten 540845 spots for SRR5713918\n'

SRR5713917
Program fastq-dump successfully executed
b'Read 669920 spots for SRR5713917\nWritten 669920 spots for SRR5713917\n'

SRR5713922
Program fastq-dump successfully executed
b'Read 573499 spots for SRR5713922\nWritten 573499 spots fo

# Rename `_3` read files to `_2` in paired reads

In [130]:
def rename_third_read_files(reads_dir):
    """
    Rename every ``<sample>_3.fastq.gz`` below ``reads_dir`` to ``<sample>_2.fastq.gz``.

    Only the file name is inspected (not the directories leading to it), and
    only the ``_3.fastq.gz`` suffix is replaced, so sample names that contain
    underscores are kept intact. A file is skipped, with a message, when the
    ``_2`` target already exists, because ``os.rename`` would otherwise
    overwrite it.

    Parameters
    ----------
    reads_dir : str
        Root folder that is walked recursively.

    Returns
    -------
    list of (str, str)
        ``(old_path, new_path)`` for every file that was renamed.
    """
    old_suffix = "_3.fastq.gz"
    new_suffix = "_2.fastq.gz"
    renamed = []
    for root, _, files in os.walk(reads_dir):
        for name in files:
            if not name.endswith(old_suffix):
                continue
            filename = os.path.join(root, name)
            new_filename = os.path.join(root, name[:-len(old_suffix)] + new_suffix)
            if os.path.exists(new_filename):
                # Never clobber an existing second-read file.
                print("SKIPPED, target already exists: " + filename + "\n")
                continue
            print(filename)
            print(new_filename + "\n")
            os.rename(filename, new_filename)
            renamed.append((filename, new_filename))
    return renamed


# Assigned to a name so the notebook does not echo the returned list.
renamed_files = rename_third_read_files('/processing_Data/antibioticos/mperezv/ANALYSIS/KPN_HV/reads')

/processing_Data/antibioticos/mperezv/ANALYSIS/KPN_HV/reads/ERR025468/ERR025468_3.fastq.gz
/processing_Data/antibioticos/mperezv/ANALYSIS/KPN_HV/reads/ERR025468/ERR025468_2.fastq.gz

/processing_Data/antibioticos/mperezv/ANALYSIS/KPN_HV/reads/ERR025673/ERR025673_3.fastq.gz
/processing_Data/antibioticos/mperezv/ANALYSIS/KPN_HV/reads/ERR025673/ERR025673_2.fastq.gz

/processing_Data/antibioticos/mperezv/ANALYSIS/KPN_HV/reads/ERR024843/ERR024843_3.fastq.gz
/processing_Data/antibioticos/mperezv/ANALYSIS/KPN_HV/reads/ERR024843/ERR024843_2.fastq.gz

/processing_Data/antibioticos/mperezv/ANALYSIS/KPN_HV/reads/ERR025111/ERR025111_3.fastq.gz
/processing_Data/antibioticos/mperezv/ANALYSIS/KPN_HV/reads/ERR025111/ERR025111_2.fastq.gz

/processing_Data/antibioticos/mperezv/ANALYSIS/KPN_HV/reads/ERR025117/ERR025117_3.fastq.gz
/processing_Data/antibioticos/mperezv/ANALYSIS/KPN_HV/reads/ERR025117/ERR025117_2.fastq.gz

/processing_Data/antibioticos/mperezv/ANALYSIS/KPN_HV/reads/ERR025654/ERR025654_3.fas

In [131]:
# Sanity check after the rename: list every file whose full path still
# contains "_3". An empty output means nothing is left to rename.
for root, _, files in os.walk('/processing_Data/antibioticos/mperezv/ANALYSIS/KPN_HV/reads'):
    for name in files:
        filename = os.path.join(root, name)
        if "_3" not in filename:
            continue
        print(filename)