Skip to content

Commit

Permalink
update RefSeq db build scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers committed Oct 26, 2021
1 parent a6a5f90 commit d662dc5
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 11 deletions.
40 changes: 30 additions & 10 deletions db-scripts/build-db-refseq.nf
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,47 @@ Channel.fromPath( assemblySummary )
def strain = it[8] - 'strain='
def status = it[11].split(' ')[0].toLowerCase()
if( species.contains( strain ) )
return [ it[0], it[5], species, status, it[19] - 'ftp://ftp.ncbi.nlm.nih.gov/genomes/' ]
return [ it[0], it[5], species, status, it[19] - 'https://ftp.ncbi.nlm.nih.gov/genomes/' ]
else
return [ it[0], it[5], "${species} ${strain}", status, it[19] - 'ftp://ftp.ncbi.nlm.nih.gov/genomes/' ]
return [ it[0], it[5], "${species} ${strain}", status, it[19] - 'https://ftp.ncbi.nlm.nih.gov/genomes/' ]
} )
.set { validGenomes }
.set { chValidGenomes }


process download {

tag { "${acc} - ${orgName}" }

maxForks 5
errorStrategy 'ignore'
maxRetries 3

input:
tuple val(acc), val(taxId), val(orgName), val(status), val(path) from chValidGenomes

output:
tuple val(acc), val(taxId), val(orgName), val(status), path("${acc}.gz") into chDownloadedGenomes

script:
"""
wget -O ${acc}.gz ${ncbiPath}/${path}/${path.split('/').last()}_genomic.fna.gz
"""
}


process sketch {

tag { "${acc} - ${orgName}" }

maxForks 4
errorStrategy 'ignore'
maxRetries 3
conda 'mash=2.3'

input:
set val(acc), val(taxId), val(orgName), val(status), val(path) from validGenomes
tuple val(acc), val(taxId), val(orgName), val(status), path("${acc}.gz") from chDownloadedGenomes

output:
set val(acc), val(taxId), val(status), val(orgName) into dbEntries
tuple val(acc), val(taxId), val(status), val(orgName) into chDbEntries
file("${acc}.msh") into outMash
file("${acc}.fna.gz") into outFasta

Expand All @@ -43,14 +64,13 @@ process sketch {

script:
"""
wget -O ${acc}.gz ${ncbiPath}/${path}/${path.split('/').last()}_genomic.fna.gz
gunzip ${acc}.gz
${REFERENCE_SEEKER_HOME}/share/mash sketch -k 32 -s 10000 ${acc}
gunzip -c ${acc}.gz > ${acc}
mash sketch -k 32 -s 10000 ${acc}
mv ${acc} ${acc}.fna
gzip ${acc}.fna
"""
}


dbEntries.map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" }
chDbEntries.map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" }
.collectFile( name: 'db.tsv', storeDir: "./${domain}-refseq/", newLine: true )
2 changes: 1 addition & 1 deletion db-scripts/build-db-refseq.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

NCBI_PATH='ftp://ftp.ncbi.nlm.nih.gov/genomes'
NCBI_PATH='https://ftp.ncbi.nlm.nih.gov/genomes'

DOMAIN=''
getopts "bavfp" opt;
Expand Down

0 comments on commit d662dc5

Please sign in to comment.