Permalink
Browse files

Merge pull request #79 from nextstrain/measles

Include measles build
  • Loading branch information...
trvrb committed Nov 14, 2017
2 parents 7fc3fca + 124821a commit fb79bc99e3ff2ce56f693ab86a8a4df02a7434b4
View
@@ -0,0 +1,15 @@
## Download data from Genbank
* [Genbank search URL](https://www.ncbi.nlm.nih.gov/nuccore?term=measles%5Btitle%5D%20AND%20viruses%5Bfilter%5D%20AND%20%28%225000%22%5BSLEN%5D%20%3A%20%2220000%22%5BSLEN%5D%29&cmd=DetailsSearch)
* This corresponds to the search query `measles[title] AND viruses[filter] AND ("5000"[SLEN] : "20000"[SLEN])`
* Send to : Complete Record : File : Accession List
* This downloads the file `sequence.seq`
* Open this file and remove the version suffixes (`.1`, `.2`, etc.) from the accession numbers
## Upload to fauna
`python vdb/measles_upload.py -db vdb -v measles --ftype accession --source genbank --locus genome --fname sequence.seq`
## Download from fauna
`python vdb/measles_download.py -db vdb -v measles --fstem measles --resolve_method choose_genbank`
@@ -0,0 +1,26 @@
label fix
Pennsylvania.USA/20.09 2009-05-14
Virginia.USA/15.09 2009-04-09
New_York.USA/26.09/3 2009-06-25
Florida.USA/19.09 2009-05-07
New_Jersey.USA/45.05 2005-11-05
temara.MOR/24.03 2003-06-11
California.USA/16.03 2003-04-16
California.USA/8.04 2004-02-19
Alabama.USA/13.14/B3 2014-03-26
Arizona.USA/11.08/2 2008-03-11
Calais.FRA/01.16/B3 2016-01-01
California.USA/05.14/B3 2014-01-29
California.USA/08.14/3/B3 2014-02-19
California.USA/16.03 2003-04-16
California.USA/16.12/D4 2012-04-15
California.USA/19.10/D9 2010-05-07
California.USA/47.13/D4 2013-11-19
California.USA/49.10/D8 2010-12-03
California.USA/8.04 2004-02-19
Florida.USA/19.09 2009-05-07
Gambia/1993 1993-XX-XX
Mvs/Toulon.FRA/08.07 2007-02-19
New_Jersey.USA/45.05 2005-11-05
New_York.USA/26.09/3 2009-06-25
Pennsylvania.USA/20.09 2009-05-14
@@ -0,0 +1,4 @@
label fix
DongThap.VNM/06.14/D8 vietnam
DongThap.VNM/08.14/D8 vietnam
Gambia/1993 gambia
@@ -0,0 +1,18 @@
label fix
Measles_virus_strain_Edmonston_(Schwarz_vaccine) Edmonston
Edmonston-Zagreb_vaccine Edmonston
Edmonston_B Edmonston
Measles_virus_strain_Edmonston Edmonston
Measles_virus_strain_Edmonston_(AIK-C_vaccine) Edmonston
Measles_virus_strain_Edmonston_(Moraten_vaccine) Edmonston
Measles_virus_strain_Edmonston_(Rubeovax_vaccine) Edmonston
Measles_virus_strain_Edmonston_(Zagreb_vaccine) Edmonston
Moraten_vaccine Edmonston
Rubeovox_vaccine Edmonston
attenuated_Edmonston_Enders_(Morten) Edmonston
Ichinose-B95a Ichinose
Ichinose-Vero Ichinose
Tokyo.JPN/37.99_Y_C7__MV99YC7_ Tokyo.JPN/37.99/C7
Tokyo.JPN/37.99_Y___MV99Y_ Tokyo.JPN/37.99/C7
T11wild T11
T11Ve-23 T11
View
@@ -0,0 +1,22 @@
import os,datetime
from download import download
from download import get_parser
class measles_download(download):
    """Measles flavour of the generic ``download`` class.

    The subclass adds no behaviour of its own; construction is handed
    straight to the ``download`` base class.
    """

    def __init__(self, **kwargs):
        """Forward every keyword argument unchanged to ``download.__init__``."""
        # Explicit base-class call (rather than super()) is kept on purpose:
        # it works whether or not ``download`` is a new-style class.
        download.__init__(self, **kwargs)
if __name__ == "__main__":
    # Command-line entry point: parse options, fill in defaults, make sure the
    # output directory exists, then run the measles download.
    args = get_parser().parse_args()

    # The field list is always set here, replacing whatever the parser held.
    args.fasta_fields = [
        'strain', 'virus', 'accession', 'collection_date', 'region',
        'country', 'division', 'location', 'source', 'locus', 'authors',
        'url', 'title', 'journal', 'puburl',
    ]

    # With no file stem given, fall back to "<virus>_<YYYY_MM_DD>" using
    # today's date.
    if args.fstem is None:
        today = datetime.datetime.now().strftime('%Y_%m_%d')
        args.fstem = args.virus + '_' + today

    # Create the output directory (and any missing parents) on first use.
    if not os.path.isdir(args.path):
        os.makedirs(args.path)

    # vars(args) is the namespace's own attribute dict; the same options are
    # passed to both the constructor and the download call.
    options = vars(args)
    downloader = measles_download(**options)
    downloader.download(**options)
View
@@ -0,0 +1,40 @@
import os, re, time, datetime, csv, sys
import rethinkdb as r
from Bio import SeqIO
from upload import upload
from upload import parser
class measles_upload(upload):
    """Measles flavour of the generic ``upload`` class.

    Points the uploader at the measles fix tables and defines the
    measles-specific strain-name and field-casing clean-up.
    """

    def __init__(self, **kwargs):
        """Initialise the base uploader, then set the measles fix-table paths."""
        upload.__init__(self, **kwargs)
        # Paths are relative; presumably resolved from the repository root
        # the script is run from -- confirm against the base class.
        self.date_fix_fname = "source-data/measles_date_fix.tsv"
        self.location_fix_fname = "source-data/measles_location_fix.tsv"
        self.strain_fix_fname = "source-data/measles_strain_name_fix.tsv"

    def fix_name(self, name):
        """Normalise a raw strain name.

        The name is passed through ``replace_strain_name`` before and after
        a fixed sequence of textual clean-ups. Both the original and the
        cleaned name are printed.

        Returns a ``(fixed_name, original_name)`` tuple.
        """
        original_name = name
        print("original name", original_name)

        # First whole-name lookup, applied to the untouched input.
        cleaned = self.replace_strain_name(original_name, self.fix_whole_name)

        # Drop these prefixes wherever they occur in the name.
        for prefix in ('MVs/', 'MVi/', 'Mvi/'):
            cleaned = cleaned.replace(prefix, '')

        # Ordered regex rewrites; later rules see the output of earlier ones.
        rewrites = (
            # trailing "[X9]", optionally preceded by "_" or " "  ->  "/X9"
            (r'[_ ]?\[([A-Z][0-9])\]$', r'/\1'),
            # trailing "(X9)"  ->  "/X9"
            (r'\(([A-Z][0-9])\)$', r'/\1'),
            # trailing "_X9_"  ->  "/X9"
            (r'_([A-Z][0-9])_$', r'/\1'),
            # spaces and semicolons become underscores
            (r'[ ;]', r'_'),
            # collapse a doubled slash left behind by the steps above
            (r'//', r'/'),
        )
        for pattern, replacement in rewrites:
            cleaned = re.sub(pattern, replacement, cleaned)

        # Second whole-name lookup, now against the normalised form.
        cleaned = self.replace_strain_name(cleaned, self.fix_whole_name)
        print("fixed name", cleaned)
        return cleaned, original_name

    def fix_casing(self, document):
        """Convert the ``host`` field of *document* in place.

        The field is passed through ``camelcase_to_snakecase`` when it is
        present and not ``None``; otherwise the document is left untouched.
        Nothing is returned.
        """
        for key in ['host']:
            if key not in document or document[key] is None:
                continue
            document[key] = self.camelcase_to_snakecase(document[key])
if __name__ == "__main__":
    # Command-line entry point: parse options, attach the field maps, then
    # run the measles upload.
    args = parser.parse_args()

    # Integer-keyed maps from position to field name. Presumably these index
    # the parts of each input header line -- confirm against the base
    # ``upload`` class.
    args.virus_fasta_fields = {1:'strain', 2:'collection_date', 3: 'host', 4:'country', 5:'division'}
    args.sequence_fasta_fields = {0:'accession', 1:'strain'}

    # vars(args) is the namespace's own attribute dict; the same options are
    # passed to both the constructor and the upload call.
    options = vars(args)
    uploader = measles_upload(**options)
    uploader.upload(**options)

0 comments on commit fb79bc9

Please sign in to comment.