-
Notifications
You must be signed in to change notification settings - Fork 13
/
avian_flu_upload.py
executable file
·692 lines (643 loc) · 38.1 KB
/
avian_flu_upload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
import os, re, time, datetime, csv, sys, json, math
import numpy as np
import pandas as pd
from rethinkdb import r
from Bio import SeqIO
from Bio import AlignIO
from upload import upload
from upload import get_parser
from unidecode import unidecode
# Command-line interface: extend the shared fauna upload parser with
# avian-flu-specific options.
parser = get_parser()
parser.add_argument('--data_source', default='gisaid', type=str, choices=['gisaid', 'ird'], help='data source, either gisaid or ird')
parser.add_argument('--upload_directory', default=False, action="store_true", help='upload all xls and fasta files in directory')
parser.add_argument('--vtype', default=None, help="type of virus, if applicable")
parser.add_argument('--subtype', default=None, help="subtype of virus")
parser.add_argument('--lineage', default=None, help="lineage of virus")
class flu_upload(upload):
    """
    Upload pipeline specialized for avian influenza: parses GISAID
    xls/fasta pairs or IRD fasta files and normalizes virus and sequence
    documents before uploading them to the database.
    """
    def __init__(self, **kwargs):
        upload.__init__(self, **kwargs)
        self.grouping_upload_fields = ['vtype', 'subtype', 'lineage']
        # patterns from the subtype and lineage fields in the GISAID fasta file,
        # mapping (subtype, lineage) -> (vtype, subtype, lineage).
        # NOTE: the original literal listed ('a / h5n6', '') twice; the duplicate
        # silently overwrote itself and has been removed.
        self.patterns = {('a / h1n1', 'pdm09'): ('a', 'h1n1', 'seasonal_h1n1pdm'),
                         ('a / h1n2', ''): ('a', 'h1n2', None),
                         ('a / h1n2', 'seasonal'): ('a', 'h1n2', 'seasonal_h1n2'),
                         ('a / h2n2', ''): ('a', 'h2n2', None),
                         ('a / h3n2', ''): ('a', 'h3n2', 'seasonal_h3n2'),
                         ('a / h3n2', 'seasonal'): ('a', 'h3n2', 'seasonal_h3n2'),
                         ('a / h3n3', ''): ('a', 'h3n3', None),
                         ('a / h5n1', ''): ('a', 'h5n1', None),
                         ('a / h5n2', ''): ('a', 'h5n2', None),
                         ('a / h5n3', ''): ('a', 'h5n3', None),
                         ('a / h5n4', ''): ('a', 'h5n4', None),
                         ('a / h5n5', ''): ('a', 'h5n5', None),
                         ('a / h5n6', ''): ('a', 'h5n6', None),
                         ('a / h5n7', ''): ('a', 'h5n7', None),
                         ('a / h5n8', ''): ('a', 'h5n8', None),
                         ('a / h5n9', ''): ('a', 'h5n9', None),
                         ('a / h6n1', ''): ('a', 'h6n1', None),
                         ('a / h7n1', ''): ('a', 'h7n1', None),
                         ('a / h7n2', ''): ('a', 'h7n2', None),
                         ('a / h7n3', ''): ('a', 'h7n3', None),
                         ('a / h7n7', ''): ('a', 'h7n7', None),
                         ('a / h7n9', ''): ('a', 'h7n9', None),
                         ('a / h9n2', ''): ('a', 'h9n2', None),
                         ('a / h10n7', ''): ('a', 'h10n7', None),
                         ('a / h10n8', ''): ('a', 'h10n8', None),
                         ('a / h11', ''): ('a', 'h11', None),
                         ('b / h0n0', 'victoria'): ('b', None, 'seasonal_vic'),
                         ('b / h0n0', 'yamagata'): ('b', None, 'seasonal_yam'),
                         ('b', 'victoria'): ('b', None, 'seasonal_vic'),
                         ('b', 'yamagata'): ('b', None, 'seasonal_yam'),
                         ('h5n1', ''): ('a', 'h5n1', None),
                         ('h7n9', ''): ('a', 'h7n9', None),
                         ('h9n2', ''): ('a', 'h9n2', None)}
        # reference outgroup sequences for the seasonal lineages
        self.outgroups = {lineage: SeqIO.read('source-data/' + lineage + '_outgroup.gb', 'genbank') for lineage in ['H3N2', 'H1N1pdm', 'Vic', 'Yam']}
        self.outgroup_patterns = {'H3N2': ('a', 'h3n2', 'seasonal_h3n2'),
                                  'H1N1': ('a', 'h1n1', 'seasonal_h1n1'),
                                  'H1N1pdm': ('a', 'h1n1', 'seasonal_h1n1pdm'),
                                  'Vic': ('b', None, 'seasonal_vic'),
                                  'Yam': ('b', None, 'seasonal_yam')}
        # tsv tables of manual corrections applied during formatting
        self.strain_fix_fname = "source-data/avian_flu_strain_name_fix.tsv"
        self.location_fix_fname = "source-data/flu_location_fix.tsv"
        self.virus_to_sequence_transfer_fields = ['submission_date']
        # strain names seen in an unexpected format (reported at the end)
        self.fix = set()
def parse(self, path, fname, data_source, upload_directory, **kwargs):
    '''
    Parse virus and sequence documents from files under `path`.

    data_source selects the input format: 'gisaid' (linked xls + fasta) or
    'ird' (single fasta). With upload_directory, every gisaid*.xls /
    gisaid*.fasta pair in `path` is parsed instead of a single named file.
    :return: (viruses, sequences) lists of document dicts
    '''
    viruses, sequences = [], []
    # data_source comes through **kwargs ie 'gisaid'
    if (data_source == 'gisaid'):
        if upload_directory:
            import glob
            # Sort both listings so each xls is paired with its matching
            # fasta: glob order is filesystem-dependent, so an unsorted
            # zip() could silently pair the wrong xls/fasta files.
            for xls_fname, fasta_fname in zip(sorted(glob.glob(path + "gisaid*.xls")), sorted(glob.glob(path + "gisaid*.fasta"))):
                parsed = self.parse_files(xls_fname, fasta_fname, **kwargs)
                viruses.extend(parsed[0])
                sequences.extend(parsed[1])
        else:
            fasta_fname = path + fname + ".fasta"
            xls_fname = path + fname + ".xls"
            viruses, sequences = self.parse_files(xls_fname, fasta_fname, **kwargs)
        print("Parsed total of " + str(len(viruses)) + " viruses and " + str(len(sequences)) + " sequences from files")
    elif (data_source == 'ird'):
        print("path + fname", path + fname)
        viruses, sequences = self.parse_fasta_file(path + fname, data_source, **kwargs)
        print("Parsed " + str(len(viruses)) + " viruses and " + str(len(sequences)) + " sequences from file " + path+fname)
    else:
        print("Missing data source")
    return viruses, sequences
def parse_files(self, xls_fname, fasta_fname, **kwargs):
    '''
    Parse one linked xls/fasta pair downloaded from GISAID and return the
    (viruses, sequences) document lists.
    '''
    virus_docs = self.parse_gisaid_xls_file(xls_fname, **kwargs)
    # parse_fasta_file returns (viruses, sequences); only sequences matter here
    _, sequence_docs = self.parse_fasta_file(fasta_fname, args.data_source, **kwargs)
    print("Parsed " + str(len(virus_docs)) + " viruses and " + str(len(sequence_docs)) + " sequences from files", fasta_fname, xls_fname)
    return virus_docs, sequence_docs
def parse_fasta_file(self, fasta, data_source, **kwargs):
    '''
    Parse FASTA file with default header formatting.
    Header fields are split on '|' and mapped through the module-level
    sequence_fasta_fields / virus_fasta_fields index dicts (set in __main__).
    :return: (viruses, sequences) lists of documents (dicts) to upload
    '''
    sequences = []
    viruses = []
    try:
        handle = open(fasta, 'r')
    except IOError:
        raise Exception(fasta, "not found")
    # `with` guarantees the handle is closed even if a record fails to
    # parse (the original closed it manually and could leak it)
    with handle:
        for record in SeqIO.parse(handle, "fasta"):
            content = list(map(lambda x: x.strip(), record.description.replace(">", "").split('|')))
            s = {key: content[ii] if ii < len(content) else "" for ii, key in sequence_fasta_fields.items()}
            s['sequence'] = str(record.seq)
            if data_source == 'ird':
                # IRD encodes the segment as a number; map it to the locus name
                convert_segment_to_locus = {"1": "PB2", "2": "PB1", "3": "PA", "4": "HA", "5": "NP", "6": "NA", "7": "MP", "8": "NS"}
                s['locus'] = convert_segment_to_locus[s['locus']]
            s = self.add_sequence_fields(s, **kwargs)
            sequences.append(s)
            # IRD fasta headers also carry the virus-level metadata
            if data_source == 'ird':
                v = {key: content[ii] if ii < len(content) else "" for ii, key in virus_fasta_fields.items()}
                viruses.append(v)
    viruses = [self.add_virus_fields(v, **kwargs) for v in viruses]
    return viruses, sequences
def parse_gisaid_xls_file(self, xls, xls_fields_wanted, **kwargs):
    '''
    Parse GISAID excel metadata file using pandas.
    xls_fields_wanted is a list of (new_field, xls_column) pairs; missing
    columns become None.
    :return: list of documents (dicts of attributes) to upload
    '''
    try:
        handle = open(xls, 'rb')
    except IOError:
        raise Exception(xls, "not found")
    # close the handle deterministically (the original never closed it)
    with handle:
        # dtype=object is necessary so that the .where() below works;
        # otherwise we get a mix of NaNs and Nones (.where only works on
        # dtype object)
        df = pd.read_excel(handle, dtype=object)
    # NaN -> None: Nones are serialized to JSON null, but NaNs are not and
    # give errors during the upload
    df = df.where((pd.notnull(df)), None)
    viruses = df.to_dict('records')
    viruses = [{new_field: v[old_field] if old_field in v else None for new_field, old_field in xls_fields_wanted} for v in viruses]
    viruses = [self.add_virus_fields(v, **kwargs) for v in viruses]
    return viruses
def format_ird_date(self, virus):
    '''
    Normalize IRD date attributes (collection dates arrive as MM/DD/YYYY,
    e.g. 02/28/2016) into the YYYY-MM-DD form (with 'XX' for unknown
    parts) that format_date in upload.py can read.
    '''
    # (detection pattern, anchored substitution pattern, replacement),
    # tried in order; the first detection that fires wins
    rules = [
        (r'(\d\d|XX)/(\d\d|XX)/\d\d\d\d', r'^(\d\d|XX)/(\d\d|XX)/(\d\d\d\d)$', r'\3-\1-\2'),  # ex. XX/XX/2002 or 09/22/2002
        (r'(\d|X)/(\d|X)/\d\d\d\d', r'^(\d|X)/(\d|X)/(\d\d\d\d)$', r'\3-0\1-0\2'),            # ex. 9/1/2002
        (r'(\d\d|XX)/(\d|X)/\d\d\d\d', r'^(\d\d|XX)/(\d|X)/(\d\d\d\d)$', r'\3-\1-0\2'),       # ex. 09/1/2002
        (r'(\d|X)/(\d\d|XX)/\d\d\d\d', r'^(\d|X)/(\d\d|XX)/(\d\d\d\d)$', r'\3-0\1-\2'),       # ex. 9/01/2002
        (r'(\d\d/\d\d\d\d)', r'^(\d\d)/(\d\d\d\d)$', r'\2-\1-XX'),                            # ex. 06/2009 (day unknown)
        (r'\d\d\d\d', r'^(\d\d\d\d)$', r'\1-XX-XX'),                                          # ex. 2009 (day and month unknown)
    ]
    present = [f for f in ('date', 'collection_date', 'submission_date') if f in virus]
    for field in present:
        raw = virus[field]
        if raw is None or raw.strip() == '':
            virus[field] = None
            continue
        # ex. 2002_04_25 to 2002-04-25
        raw = re.sub(r'_', r'-', raw)
        for detect, anchored, repl in rules:
            if re.match(detect, raw):
                # anchored sub leaves already-formatted values untouched
                virus[field] = re.sub(anchored, repl, raw)
                break
        else:
            print("Couldn't reformat this date: " + raw + ", setting to None")
            virus[field] = None
def format_viruses(self, documents, data_source, **kwargs):
    '''
    format virus information in preparation to upload to database table
    '''
    # load the manual fix tables once for the whole batch
    if self.strain_fix_fname is not None:
        self.fix_whole_name = self.define_strain_fixes(self.strain_fix_fname)
    if self.location_fix_fname is not None:
        self.fix_location = self.define_location_fixes(self.location_fix_fname)
    self.define_countries("source-data/geo_synonyms.tsv")
    self.define_regions("source-data/geo_regions.tsv")
    self.define_location_label_fixes("source-data/flu_fix_location_label.tsv")
    for doc in documents:
        if 'strain' in doc:
            # fix_name returns (cleaned_name, original_name)
            doc['strain'], doc['gisaid_strain'] = self.fix_name(doc['strain'])
            if data_source == "gisaid":
                doc['gisaid_strain'] = doc['gisaid_strain'].replace(" ", "")
        else:
            print("Missing strain name!")
        # NOTE(review): several calls below read the module-level `args`
        # (set in __main__) rather than the data_source parameter
        self.fix_casing(doc, args.data_source)
        self.fix_age(doc)
        self.format_host(doc)
        self.format_domestic_status(doc)
        self.format_animal_health_status(doc)
        self.format_authors(doc)
        self.determine_group_fields(doc, self.patterns)
        # IRD dates need pre-normalization before the generic format_date
        if args.data_source == 'ird':
            self.format_ird_date(doc)
        self.format_date(doc)
        self.format_country(doc, args.data_source)  # first format from strain name
        if self.fix_location is not None:  # override with fixes
            if doc['strain'] in self.fix_location:
                doc['location'] = self.fix_location[doc['strain']]
        self.format_place(doc, determine_location=True)
        self.format_region(doc)
        self.rethink_io.check_optional_attributes(doc, [])
def format_sequences(self, documents, **kwargs):
    '''
    format sequence information in preparation to upload to database table
    '''
    for doc in documents:
        if 'strain' in doc:
            # fix_name returns (cleaned_name, original_name)
            doc['strain'], doc['gisaid_strain'] = self.fix_name(doc['strain'])
            doc['gisaid_strain'] = doc['gisaid_strain'].rstrip("_")
        else:
            print("Missing strain name!")
        self.format_date(doc)
        # categorize each passage annotation as egg/cell/unpassaged/undetermined
        self.format_passage(doc, 'passage', 'passage_category')
        self.format_passage(doc, 'virus_strain_passage', 'virus_strain_passage_category')  # BP
        self.format_passage(doc, 'serum_antigen_passage', 'serum_antigen_passage_category')  # BP
        self.rethink_io.check_optional_attributes(doc, [])
        self.fix_casing(doc, args.data_source)
    # report strain names flagged (e.g. by correct_strain_format) as malformed
    print("Names that need to be fixed")
    for name in sorted(self.fix):
        print(name)
def filter(self, documents, index, **kwargs):
    '''
    Filter out documents that lack the `index` field, plus any strains on
    the (currently empty) removal blacklist.
    '''
    print(str(len(documents)) + " documents before filtering")
    indexed_docs = [doc for doc in documents if index in doc]
    # remove certain documents from gisaid files that were not actually
    # isolated from humans (list currently empty, so nothing is dropped)
    remove_labels = []
    result_documents = [doc for doc in indexed_docs if all(label not in doc['strain'] for label in remove_labels)]
    #result_documents = [doc for doc in result_documents if self.correct_strain_format(doc['strain'], doc['gisaid_strain'])]
    print(str(len(result_documents)) + " documents after filtering")
    return result_documents
def correct_strain_format(self, strain, original_strain):
    '''
    Return True when `strain` matches one of the accepted layouts, else
    record it in self.fix, print a warning and return False.
    Okay patterns: B/Brisbane/46/2015, A/HongKong/1968, A/Zambia/13/176/2013,
    A/Cologne/Germany/12/2009, A/Algeria/G0164/15/2015, A/India/Delhi/DB106/2009,
    A/Cameroon/LEID/01/11/1387/2011, A/India/M/Enc/1/2003
    '''
    # [AB] instead of the original [A|B]: a character class with '|' also
    # (incorrectly) accepted a literal pipe as the virus type
    if re.match(r'[AB]/[A-Za-z-]+/([A-Za-z0-9_-]+/)*[0-9]{4}$', strain) or re.match(r'[AB]/[A-Za-z-]+/([A-Za-z0-9_-]+/){2}[0-9]{4}$', strain)\
            or re.match(r'[AB]/([A-Za-z-]+/){2}([0-9]+/){3}[0-9]{4}$', strain):
        return True
    print("This strain name was not in the correct format and will be filtered out", strain, original_strain)
    self.fix.add(strain)
    # explicit False (the original fell off the end and returned None)
    return False
def fix_casing(self, doc, data_source):
    '''
    Normalize casing/formatting of gisaid-specific fields in place.
    '''
    # lab names: spaces/hyphens -> underscores, lowercased
    for field in ['originating_lab', 'submitting_lab']:
        if field in doc and doc[field] is not None:
            doc[field] = doc[field].replace(' ', '_').replace('-', '_').lower()
    for field in ['gender', 'host', 'locus']:
        if field in doc and doc[field] is not None:
            doc[field] = self.camelcase_to_snakecase(doc[field])
            doc[field] = doc[field].lstrip("_").rstrip("_")
    # gisaid accessions are stored with their EPI prefix
    # (a no-op `doc['accession'] = doc['accession']` branch for ird data
    # in the original was removed)
    if (doc.get('accession') is not None and
            not doc['accession'].startswith('EPI') and
            data_source == 'gisaid'):
        doc['accession'] = 'EPI' + doc['accession']
    if 'isolate_id' in doc and doc['isolate_id'] is not None:
        doc['isolate_id'] = doc['isolate_id'].lstrip("_").rstrip("_")
    if 'submitting_lab' in doc and doc['submitting_lab'] is not None:
        doc['submitting_lab'] = doc['submitting_lab'].lstrip("_").rstrip("_")
def fix_age(self, doc):
    '''
    Combine gisaid Host_Age / Host_Age_Unit into a single `age` field
    (e.g. "34y"); the raw fields are removed. `age` stays None when
    either part is missing or unparseable.
    '''
    temp_age, temp_age_unit = None, None
    doc['age'] = None
    if 'Host_Age' in doc:
        try:
            # ages may arrive as floats or strings; normalize to an int string
            temp_age = str(int(float(doc['Host_Age'])))
        except (TypeError, ValueError):
            # missing/non-numeric age: leave temp_age as None
            # (narrowed from the original bare `except`)
            pass
        del doc['Host_Age']
    if 'Host_Age_Unit' in doc:
        if isinstance(doc['Host_Age_Unit'], str):
            temp_age_unit = doc['Host_Age_Unit'].lower()
        else:
            # default unit is years when the unit cell is empty/non-string
            temp_age_unit = 'y'
        del doc['Host_Age_Unit']
    if isinstance(temp_age, str) and isinstance(temp_age_unit, str):
        doc['age'] = temp_age + temp_age_unit
    return doc
def define_location_fixes(self, fname):
    '''
    Read a tab-separated label/fix table (skipping '#' comment lines) and
    return a {label: fix} dict for whole-location replacement.
    '''
    fix_location = {}
    # `with` closes the file promptly; the original leaked the handle
    with open(fname) as fh:
        reader = csv.DictReader(filter(lambda row: row[0] != '#', fh), delimiter='\t')
        for line in reader:
            # labels may contain backslash escapes (e.g. \u00e9); decode them
            fix_location[line['label'].encode().decode('unicode-escape')] = line['fix']
    return fix_location
def define_location_label_fixes(self, fname):
    '''
    Read a tab-separated label/fix table (skipping '#' comment lines) into
    self.label_to_fix, keyed by the label lowercased with spaces removed
    (matching how strain-name labels are looked up in fix_name).
    '''
    self.label_to_fix = {}
    # `with` closes the file promptly; the original leaked the handle
    with open(fname) as fh:
        reader = csv.DictReader(filter(lambda row: row[0] != '#', fh), delimiter='\t')
        for line in reader:
            self.label_to_fix[line['label'].encode().decode('unicode-escape').replace(' ', '').lower()] = line['fix']
def fix_name(self, name):
    '''
    Fix strain names.
    Returns (result_name, original_name): the cleaned canonical name and
    the ascii-fied, underscored name as received (kept for traceability).
    '''
    # replace all accents with ? mark
    original_name = name.encode('ascii', 'replace').decode('unicode-escape')
    # return original_name, original_name
    # Replace whole strain names
    name = self.replace_strain_name(original_name, self.fix_whole_name)
    # strip subtype tokens, boilerplate annotations and punctuation; note the
    # trailing .replace('_','').replace('-','') removes ALL underscores and
    # hyphens from the name
    name = name.replace('H1N1', '').replace('H5N6', '').replace('H3N2', '').replace('H5N1', '').replace('H7N9', '').replace('H9N2', '')\
        .replace('Influenza A Virus', '').replace('segment 4 hemagglutinin (HA) gene', '').replace("segment 6 neuraminidase (NA) gene", "")\
        .replace('Human', '').replace('human', '').replace('//', '/').replace('.', '').replace(',', '').replace('&', '').replace(' ', '_')\
        .replace('\'', '').replace('>', '').replace('-like', '').replace('+', '').replace('_', '').replace('-', '')  # above at end used to be .replace(' ', '')
    name = name.lstrip('-').lstrip('_').lstrip(')').lstrip('(')
    # NOTE(review): the second lstrip('-') below repeats the line above and
    # was probably meant to be rstrip('-') — confirm before changing
    name = name.lstrip('-').rstrip('_').rstrip(')').rstrip('(')
    split_name = name.split('/')
    # check location labels in strain names for fixing
    # for this first check for the location fixes, only check and replace the beginning
    # of the strain name, avoiding the last 2 splits that contain the random id and the year.
    # This is to avoid issues where the random id happens to match a location like
    # "A/chicken/Hubei/wi/1997" getting converted to "A/chicken/Hubei/Wisconsin/1997"
    for index, label in enumerate(split_name[:-2]):
        if label.replace(' ', '').lower() in self.label_to_fix:
            split_name[index] = self.label_to_fix[label.replace(' ', '').lower()]
    name = '/'.join(split_name)
    name = self.flu_fix_patterns(name)
    # Strip leading zeroes, change all capitalization location field to title case
    split_name = name.split('/')
    if len(split_name) == 4:
        if split_name[1].isupper() or split_name[1].islower():
            split_name[1] = split_name[1].title()  # B/WAKAYAMA-C/2/2016 becomes B/Wakayama-C/2/2016
        split_name[2] = split_name[2].lstrip('0')  # A/Mali/013MOP/2015 becomes A/Mali/13MOP/2015
        split_name[3] = split_name[3].lstrip('0')  # A/Cologne/Germany/01/2009 becomes A/Cologne/Germany/1/2009
    result_name = '/'.join(split_name).strip()
    original_name = original_name.replace(" ", "_")
    return result_name, original_name
def flu_fix_patterns(self, name):
    '''
    Apply regex cleanups to a strain name: uppercase the a/b virus type,
    drop parenthesised annotations, strip trailing slashes and expand
    two-digit years to four digits.
    '''
    # capitalization of virus type: b/sydney/508/2008 -> B/sydney/508/2008
    type_match = re.match(r'([a|b])([\w\s\-/]+)', name)
    if type_match:
        name = type_match.group(1).upper() + type_match.group(2)
    # remove inner parentheses and their contents, e.g. A/Egypt/51(S)/2006
    inner = re.match(r'([^(]+)[^)]+\)(.+)', name)
    if inner:
        name = inner.group(1) + inner.group(2)
    # remove ending parentheses and their contents,
    # e.g. A/Eskisehir/359/2016 (109) -> A/Eskisehir/359/2016
    trailing = re.match(r'([^(]+)[^)]+\)$', name)
    if trailing:
        name = trailing.group(1)
    # strip trailing slashes: A/NorthernTerritory/60/68//, A/Paris/455/2015/
    name = name.rstrip('/')
    # two-digit years to four digits: B/Florida/1/96 -> B/Florida/1/1996
    year_match = re.match(r'([\w\s\-/]+)/([0-9][0-9])$', name)
    if year_match:
        year = year_match.group(2)
        century = "20" if int(year) < 66 else "19"
        name = year_match.group(1) + "/" + century + year
    return name
def format_gisaid_clade(self, v):
    # normalize the GISAID clade annotation: trimmed and lowercased
    clade = v['gisaid_clade']
    if clade is not None:
        v['gisaid_clade'] = clade.strip().lower()
def format_domestic_status(self, v):
    # normalize the domestic-status annotation: trimmed and lowercased
    status = v['domestic_status']
    if status is not None:
        v['domestic_status'] = status.strip().lower()
def format_animal_health_status(self, v):
    # normalize the animal-health-status annotation: trimmed and lowercased
    status = v['animal_health_status']
    if status is not None:
        v['animal_health_status'] = status.strip().lower()
def format_authors(self, v):
    # drop carriage returns / newlines that leak in from the xls export
    authors = v['authors']
    if authors is not None:
        v['authors'] = authors.replace("\r", "").replace("\n", "")
def format_host(self, v):
    '''
    Collapse the free-form host field into broad categories:
    avian / environment / cattle / nonhuman_mammal / other / human.
    When the host field is empty, fall back to the species embedded in
    5-field strain names (A/<species>/<loc>/<id>/<year>).

    Fixes vs the original: two missing commas caused implicit string
    concatenation ("circusaeruginosus,corvussplendens" and
    "greylag_goosegrebe"), so those four hosts never matched; mammal hosts
    with 4-field strain names fell through unclassified and are now set to
    nonhuman_mammal. Lists are sets for O(1) membership.
    '''
    avian_list = {
        "accipitercooperii", "accipitergentilis", "accipiternisus", "accipitertrivirgatus", "aixsponsa",
        "african__stonechat", "aixgalericulata", "alectorischukar", "american__black__duck",
        "americanpelican", "american__wigeon", "americanwigeon", "anade",
        "anassibilatrix", "anasboschas", "anasacuta", "anasamericana", "anasfalcata",
        "anaspenelope", "anasflavirostris", "ansersp.", "anseriformessp.", "anasquerquedula",
        "anseranserdomesticus", "anserbrachyrhynchus", "ansercanagica", "ansercaerulescens",
        "ansercygnoides", "anasplatyrhynchosf.domestica", "anas_platyrhynchos",
        "anascarolinensis", "anasclypeata", "anascrecca", "anascyanoptera",
        "anasdiscors", " anasfalcata", "anasgeorgica", "anasformosa", "anasplatyrhynchos", "anaspoecilorhyncha",
        "anasrubripes", "anassp.", "anasstrepera", "anasplatyrhynchosvar.domesticus",
        "anasundalata", "anseranser", "anserfabalis", "anseralbifrons", "anthropoidesvirgo",
        "anserindicus", "arenariainterpres", "ardeacinerea", "anaszonorhyncha", "aythyaaffinis",
        "anserrossii",
        "aythyamarila", "aythyafuligula", "aythyaamericana", "aythya_americana", "aythyanyroca", "aythyacollaris", "aythyaferina",
        "avian", "baldeagle", "bar__headed__goose", "barnacle_goose", "beangoose", "bird",
        "barn__swallow", "blackvulture", "black vulture", "brantabernicla", "brown__headed__gull", "bucephalaclangula", "buteo",
        "baikal__teal", "bewick's__swan", "black__billed__magpie", "babbler", "black-headedgull",
        "buteobuteo", "buteojamaicensis", "buteojaponicus",
        "blue__winged__teal", "blue-wingedteal", "bluegoose",
        "brantahutchinsii", "brantacanadensis", "brantaleucopsis", "buteolineatus",
        "cairinamoschata", "calidrisalba", "calidris_canutus", "calidriscanutus", "calidrisminutilla", "canada__goose", "chencaerulescens",
        "chencanagica", "chicken", "chukar", "chroicocephalusridibundus", "ciconiaciconia", "common__pochard",
        "common__goldeneye", "common__coot", "common__pheasant", "commonteal", "common_teal", "condor",
        "cooper'shawk", "cormorant", "corvus", "copsychussaularis", "corvusmacrorhynchos",
        "coturnix", "coturnixsp.", "coturniccoturnix", "coturnixjaponica", "chlidoniashybridus",
        "crane", "crow", "cygnus", "cyrtonyxmontezumai", "curlew", "cygnusatratus", "chinese__francolin",
        # fixed: "circusaeruginosus," was missing its separator and fused with
        # "corvussplendens" via implicit string concatenation
        "chroicocephaluscirrocephalus", "corvusfrugilegus", "circusaeruginosus",
        "corvussplendens", "cygnuscolumbianus", "cygnuscygnus", "cygnus_cygnus", "cygnusolor",
        "dendrocygnaviduata", "dendrocygnaautumnalis", "domesticgoose", "duck", "dove",
        "eagle", "egret", "egyptiangoose", "eurasiancurlew", "eurasian__eagel__owl", "emperorgoose",
        "eurasian__wigeon", "falco", "falcon",
        "falcoperegrinus", "finch", "francolinus", "fowl",
        "falcotinnunculus", "falscorusticolus",
        "gadwall", "gallinulachloropus", "gallus", "gallusgallus", "gallusgallusdomesticus",
        "gallinagogallinago",
        "goose", "graculareligiosa", "great__black__headed__gull", "grey_teal", "greyteal", "garrulaxcanorus", "garganey",
        "glaucous-wingedgull", "greygull", "glaucousgull",
        "great__crested__grebe", "greatcrestedgrebe", "greatbustard", "great__bustard", "greattit",
        # fixed: missing comma fused "greylag_goose" and "grebe"
        "greater__white__fronted__goose", "greylaggoose", "greylag_goose", "grebe",
        "green__winged__teal", "green-wingedteal", "grey__teal",
        "grey__heron", "guineafowl", "gull", "halietusleucocephalus", "haliaeetusleucocephalus",
        "halietusalbicilla", "himantopushimantopusmelanurus", "larusfuscus",
        "heron", "herringgull", "hirundorustica", "houbara__bustard", "japanese__white__eye", "japanese__quail",
        "larusarmenicus", "larusschistisagus", "larussmithsonianus", "larusargentatus", "larusbrunnicephalus",
        "larusglaucescens", "larusmarinus", "larusmelanocephalus", "laruscachinnans", "larosternainca",
        "larusatricilla", "laruscanus", "larusdelawarensis", "larusdominicanus", "laughing__gull", "larus",
        "larusichthyaetus", "larusridibundus", "leucophaeusatricilla",
        "leucophaeus", "little__grebe",
        "little__egret", "lophuranycthemera", "lophodytescucullatus",
        "magpie", "magpie__robin", "mallard",
        "mallardduck", "marecapenelope", "murre",
        "morphnusguianensis", "mulardduck", "mute__swan", "muscovy__duck", "myna", "meleagrisgallopavo",
        "necrosyrtesmonachus", "nisaetusnipalensis", "northernpintail",
        "northern__shoveler", "northernshoveler", "numidasp.",
        "northern__pintail", "numidameleagris", "numeniusarquata",
        "openbill__stork", "oreortyx", "ostrich", "oystercatcher", "otheravian",
        "parabuteo", "parabuteounicinctus",
        "partridge", "passerdomesticus", "parakeet", "parrot", "passerine", "passermontanus",
        "pavocristatus", "peacock", "peafowl", "phasianuscolchicus", "phasianus",
        "phasaniussp.", "pheasant", "phasaniuscolchicus", "pelican", "pelecanus",
        "penguin", "peregrine__falcon", "picapica", "pica", "pigeon", "pink__footed__goose",
        "polyplectronbicalcaratum", "podicepscristatus",
        "poultry", "pygoscelisantarcticus", "rissatridactyla", "rynchopsniger",
        "quail", "rails", "rail", "ring-neckedduck", "rook", "ruddy__turnstone", "ruddyturnstone",
        "ruddyshelduck", "rosy__billed__pochard", "sacredibis",
        "saker__falcon", "sanderling", "sandpiper", "scolopaxrusticola", "shrike",
        "shorebird", "silky__chicken",
        "silverteal", "snow__goose", "somateriamollissima",
        "sparrow", "speckledpigeon", "starling", "sternasandvicensis", "swan", "sterna",
        "sternahirundo", "sternaparadisaea",
        "streptopeliadecaocto",
        "stork", "swiftlet",
        "tachybaptusruficollis", "tadornaferuginea", "tadornatadorna",
        "teal", "turkey", "tern", "turtledove", "tree__sparrow", "turnstone", "us_quail", "waterbird", "waterfowl",
        "wild__turkey", "wildwaterfowl", "white__bellied__bustard",
        "white-frontedgoose",
        "wild__chicken", "wild__duck", "wildbirds",
        "whooper__swan", "whooperswan", "wildbird", "yellow__billed__duck", "zosteropsjaponicus"}
    environment_list = {
        "feces", "otherenvironment", "surfaceswab", "watersample", "environment",
        "airsample"}
    cattle_list = {"dairycattle", "cattle", "cow", "bovine", "dairycow"}
    nonhuman_mammal_list = {
        "bat", "canine", "equine", "feline", "harbourseal", "mammals", "mink", "othermammals",
        "primate", "swine", "pig", "susscrofadomesticus", "lion", "weasel", "raccoon__dog", "tiger",
        "dog", "large__cat", "mouse", "murine", "pika", "seal", "meerkat", "cat", "feliscatus", "rousettusaegyptiacus", "rodent"}
    other_list = {
        "circus", "ferret", "insect", "laboratoryderived", "unknown", "animal", "host"}
    if v['host'] is not None:
        if v['host'] in avian_list:
            v['host'] = "avian"
        elif v['host'] in environment_list:
            v['host'] = "environment"
        elif v['host'] in cattle_list:
            v['host'] = "cattle"
        # extra logic to find cattle sequences that are annotated as a
        # generic mammal host but carry the species in the strain name
        elif v['host'] in nonhuman_mammal_list:
            if len(v['strain'].split("/")) == 5:
                species = v['strain'].split("/")[1]
                if species in cattle_list:
                    v['host'] = "cattle"
                else:
                    v['host'] = "nonhuman_mammal"
            else:
                # 4-field strains previously fell through unclassified
                v['host'] = "nonhuman_mammal"
        elif v['host'] in other_list:
            v['host'] = "other"
        elif v['host'] == 'human':
            v['host'] = "human"
        # if no host attribute, but there is a host in the strain name
        elif v['host'] == '' and len(v['strain'].split("/")) == 5:
            hostspecies = v['strain'].split("/")[1]
            if hostspecies in avian_list:
                v['host'] = "avian"
            elif hostspecies in environment_list:
                v['host'] = 'environment'
            elif hostspecies in nonhuman_mammal_list:
                v['host'] = 'nonhuman_mammal'
            elif hostspecies in other_list:
                v['host'] = 'other'
        else:
            print("cannot classify host for", v['strain'], v['host'])
def format_country(self, v, data_source):
    '''
    Label viruses with location/division/country based on strain name.
    A/Taiwan/1/2013 is a human virus: four fields, take the second.
    A/Chicken/Taiwan/1/2013 is an animal virus: five fields, take the third.
    Else, fall back to the GISAID location.
    '''
    strain_name = v['strain']
    original_name = v['gisaid_strain']
    result = None
    field_count = 0
    # initialized up front: the original left `loc` unbound (UnboundLocalError)
    # whenever the strain name contained no '/'
    loc = None
    if '/' in strain_name:
        field_count = len(strain_name.split('/'))
        if field_count == 4:
            loc = strain_name.split('/')[1].replace(" ", "")
            result = self.determine_location(loc)
        elif field_count == 5:
            loc = strain_name.split('/')[2].replace(" ", "")
            result = self.determine_location(loc)
        else:
            loc = None
            print("improperly formatted strain name, ", strain_name, original_name)
    # Some old avian viruses have incorrectly formatted strain names missing a
    # strain identifier, so they are 4 fields long instead of 5. For most of
    # these the location lookup errors out, but because Turkey is an actual
    # country, turkey viruses would be silently mislabelled as West Asian —
    # flag those for manual review.
    if len(strain_name.split('/')) == 4 and "turkey" in strain_name.lower():
        print("check location for", strain_name, "original strain name: ", original_name, "location ", loc)
    # "Georgia" can be the country or the US state. For gisaid records, check
    # the gisaid region: if it is asia, force the country.
    if loc is not None:
        if loc.lower() == "georgia":
            if data_source == "gisaid":
                if v['gisaid_location'] is not None:
                    region = v['gisaid_location'].split('/')[0].replace(" ", "")
                    if region.lower() == "asia":
                        loc = "GeorgiaCountry"
            result = self.determine_location(loc)
    if data_source == "gisaid":
        # fall back to the last component of the gisaid location
        if v['gisaid_location'] is not None and result is None:
            loc = v['gisaid_location'].split('/')[-1].replace(" ", "")
            result = self.determine_location(loc)
    if data_source == 'ird':
        if field_count == 4 and v['host'].lower() == 'human':
            loc = strain_name.split("/")[1]
        elif field_count == 4 and v['host'].lower() != 'human':
            loc = strain_name.split("/")[2]
        elif field_count == 5:
            loc = strain_name.split("/")[2]
        else:
            loc = None
        result = self.determine_location(loc)
    if result is not None:
        v['location'], v['division'], v['country'] = result
    else:
        v['location'], v['division'], v['country'] = None, None, None
        print("couldn't parse country for ", original_name, strain_name)
    # blank out a division that merely repeats the country. The original wrote
    # `v['division'] == '?'` (twice) — a no-op comparison where an assignment
    # was clearly intended.
    if v['division'] == v['country']:
        v['division'] = '?'
def format_passage(self, doc, initial_field, new_field, **kwargs):
    '''
    Separate passage into general categories.
    Regex borrowed from McWhite et al. 2016.
    Sets doc[new_field] to one of "egg"/"cell"/"unpassaged"/"undetermined";
    both fields become None when the initial field is absent or None.
    '''
    if initial_field in doc and doc[initial_field] is not None:
        passage = doc[initial_field].upper()
        passage_category = "undetermined"
        if re.search(r'AM[1-9]|E[1-9]|AMNIOTIC|EGG|EX|AM_[1-9]', passage):  # McWhite
            passage_category = "egg"
        elif re.search(r'AM-[1-9]|EMBRYO|^E$', passage):
            passage_category = "egg"
        elif re.search(r'LUNG|P0|OR_|ORIGINAL|CLINICAL|DIRECT', passage):  # McWhite
            passage_category = "unpassaged"
        elif re.search(r'ORGINAL|ORIGNAL|CLINCAL|THROAT|PRIMARY|NASO|AUTOPSY|BRONCHIAL|INITIAL|NASAL|NOSE|ORIG|SWAB', passage):
            passage_category = "unpassaged"
        elif re.search(r'TMK|RMK|RHMK|RII|PMK|R[1-9]|RX', passage):  # McWhite
            passage_category = "cell"
        elif re.search(r'S[1-9]|SX|SIAT|MDCK|MCDK|C[1-9]|CX|M[1-9]|MX|X[1-9]|^X_$', passage):  # McWhite
            passage_category = "cell"
        elif re.search(r'C_[1-9]|C [1-9]|MD[1-9]|MK[1-9]|MEK[1-9]', passage):
            passage_category = "cell"
        elif re.search(r'[Cc][Ee][Ll][Ll]', passage):
            passage_category = "cell"
        # fixed: the original alternation contained ' ^SX_$' with a stray
        # leading space, making that branch unmatchable
        elif re.search(r'^S[1-9]_$|^SX_$|SIAT2_SIAT1|SIAT3_SIAT1', passage):  # McWhite
            passage_category = "cell"
        elif re.search(r'UNKNOWN|UNDEFINED|NOT SPECIFIED|DIFFERENT ISOLATION SOURCES', passage):
            pass
        doc[new_field] = passage_category
    else:
        doc[initial_field] = None
        doc[new_field] = None
def determine_group_fields(self, v, patterns, **kwargs):
    '''
    Determine and assign the genetic group fields (vtype, subtype, lineage)
    from the GISAID Subtype/Lineage columns via the `patterns` lookup; the
    raw columns are removed from the document.
    '''
    # default before any pattern lookup
    v['vtype'], v['subtype'], v['lineage'] = 'tbd', 'tbd', 'tbd'
    raw_subtype = ''
    raw_lineage = ''
    if 'Subtype' in v:
        if v['Subtype'] is not None:
            raw_subtype = v['Subtype'].lower()
        del v['Subtype']
    if 'Lineage' in v:
        if v['Lineage'] is not None:
            raw_lineage = v['Lineage'].lower()
        del v['Lineage']
    # look for pattern from GISAID fasta file
    key = (raw_subtype, raw_lineage)
    if key in patterns:
        v['vtype'], v['subtype'], v['lineage'] = patterns[key]
    return v
if __name__ == "__main__":
    args = parser.parse_args()
    if args.data_source == 'gisaid':
        # gisaid fasta fields:
        # DNA Accession no. | Isolate name | Isolate ID | Segment
        sequence_fasta_fields = {0: 'accession', 1: 'strain', 2: 'isolate_id', 3: 'locus', 4: 'passage', 5: 'INSDC_accession'}
        args.fasta_fields = sequence_fasta_fields
        # (new_field, xls_column) pairs pulled from the GISAID metadata sheet
        xls_fields_wanted = [('strain', 'Isolate_Name'), ('isolate_id', 'Isolate_Id'), ('collection_date', 'Collection_Date'),
                             ('host', 'Host'), ('Subtype', 'Subtype'), ('Lineage', 'Lineage'),
                             ('gisaid_location', 'Location'), ('originating_lab', 'Originating_Lab'), ('Host_Age', 'Host_Age'),
                             ('Host_Age_Unit', 'Host_Age_Unit'), ('gender', 'Host_Gender'), ('submission_date', 'Submission_Date'),
                             ('submitting_lab', 'Submitting_Lab'), ('authors', 'Authors'), ('domestic_status', 'Domestic_Status'),
                             ('PMID', 'PMID'), ('animal_health_status', 'Animal_Health_Status'), ('gisaid_clade', 'Clade')]
        args.xls_fields_wanted = xls_fields_wanted
    elif args.data_source == 'ird':
        # IRD header example (fields 0..20):
        # >A/American_green_winged_teal/Washington/195750/2014|KP739418|1|PB2|A|H5N1|12/29/2014|14_15|USA|Washington|Green_Winged_Teal|N|AdmantaneResistance_Yes|OseltamivirResistance_No|IncreasedVirulence_Yes|EnhancedTransmission_Yes|T92E_No|No|NA|NA|2.3.4.4
        virus_fasta_fields = {0: 'strain', 4: 'vtype', 5: 'Subtype', 6: 'collection_date', 8: 'country', 10: 'host', 11: 'h5_clade'}
        sequence_fasta_fields = {0: 'strain', 1: 'accession', 2: 'locus'}
        args.virus_fasta_fields = virus_fasta_fields
        args.sequence_fasta_fields = sequence_fasta_fields
    # default data directory, created if missing
    if args.path is None:
        args.path = "data/"
    if not os.path.isdir(args.path):
        os.makedirs(args.path)
    connVDB = flu_upload(**args.__dict__)
    connVDB.upload(**args.__dict__)