Skip to content
Permalink
Browse files

Use location fixes for flu upload

  • Loading branch information
trvrb committed Dec 30, 2019
1 parent 17e18db commit 0fe70bda54cfd40936d351f4525198397d5ac28c
Showing with 22 additions and 3 deletions.
  1. +1 −0 source-data/flu_location_fix.tsv
  2. +21 −3 vdb/flu_upload.py
@@ -327,3 +327,4 @@ A/Duck/Sichuan/408 sichuan
A/Chicken/Hunan/207 hunan
A/chicken/Jiangsu/JS/02/71/2017 jiangsu
A/Chicken/Hubei/306 hubei
A/England/321/2019 england
@@ -48,6 +48,8 @@ def __init__(self, **kwargs):
'Vic': ('b', None, 'seasonal_vic'),
'Yam': ('b', None, 'seasonal_yam')}
self.strain_fix_fname = "source-data/flu_strain_name_fix.tsv"
self.location_fix_fname = "source-data/flu_location_fix.tsv"
self.location_label_fix_fname = "source-data/flu_fix_location_label.tsv"
self.virus_to_sequence_transfer_fields = ['submission_date']
self.fix = set()

@@ -121,9 +123,12 @@ def format_viruses(self, documents, **kwargs):
'''
if self.strain_fix_fname is not None:
self.fix_whole_name = self.define_strain_fixes(self.strain_fix_fname)
if self.location_fix_fname is not None:
self.fix_location = self.define_location_fixes(self.location_fix_fname)
if self.location_label_fix_fname is not None:
self.fix_location_label = self.define_location_label_fixes(self.location_label_fix_fname)
self.define_countries("source-data/geo_synonyms.tsv")
self.define_regions("source-data/geo_regions.tsv")
self.define_location_fixes("source-data/flu_fix_location_label.tsv")
for doc in documents:
if 'strain' in doc:
doc['strain'], doc['gisaid_strain'] = self.fix_name(doc['strain'])
@@ -134,7 +139,10 @@ def format_viruses(self, documents, **kwargs):
self.determine_group_fields(doc, self.patterns, kwargs['subtype'])
self.format_date(doc)
self.format_country(doc)
self.format_place(doc, determine_location=False)
if self.fix_location is not None: # override with fixes
if doc['strain'] in self.fix_location:
doc['location'] = self.fix_location[doc['strain']]
self.format_place(doc, determine_location=True)
self.format_region(doc)
self.rethink_io.check_optional_attributes(doc, [])

@@ -223,12 +231,22 @@ def fix_age(self, doc):
doc['age'] = temp_age + temp_age_unit
return doc

def define_location_fixes(self, fname):
def define_location_label_fixes(self, fname):
    '''
    Open location label fix file and define corresponding dictionary

    Reads the tab-delimited file `fname`, which must have a header row
    naming the columns `label` and `fix`. Lines starting with '#' are
    dropped before the header is parsed. Each label is unicode-escape
    decoded, stripped of spaces and lowercased before being used as a key.

    The mapping is stored on `self.label_to_fix` and also returned, so a
    caller that assigns the result gets the dictionary rather than None.
    '''
    # Local import keeps this block self-contained; codecs.decode accepts
    # both Python 2 byte strings and Python 3 text, unlike str.decode.
    import codecs

    self.label_to_fix = {}
    # Context manager closes the handle even if parsing raises.
    with open(fname) as handle:
        rows = (row for row in handle if not row.startswith('#'))
        for line in csv.DictReader(rows, delimiter='\t'):
            label = codecs.decode(line['label'], 'unicode-escape')
            self.label_to_fix[label.replace(' ', '').lower()] = line['fix']
    return self.label_to_fix

def define_location_fixes(self, fname):
    '''
    Open location fix file and define corresponding dictionary

    Reads the tab-delimited file `fname`, which must have a header row
    naming the columns `label` and `fix`. Lines starting with '#' are
    dropped before the header is parsed. Labels are unicode-escape decoded
    but otherwise kept verbatim (case and spaces preserved).

    Returns a dict mapping each decoded label to its `fix` value; nothing
    is stored on `self`.
    '''
    # Local import keeps this block self-contained; codecs.decode accepts
    # both Python 2 byte strings and Python 3 text, unlike str.decode.
    import codecs

    fix_location = {}
    # Context manager closes the handle even if parsing raises.
    with open(fname) as handle:
        rows = (row for row in handle if not row.startswith('#'))
        for line in csv.DictReader(rows, delimiter='\t'):
            label = codecs.decode(line['label'], 'unicode-escape')
            fix_location[label] = line['fix']
    return fix_location

def fix_name(self, name):
'''
Fix strain names

0 comments on commit 0fe70bd

Please sign in to comment.
You can’t perform that action at this time.