Permalink
Browse files

Improve parsing of H5N1 viruses

  • Loading branch information...
trvrb committed Jan 4, 2019
1 parent 41511bc commit 6592e6f6cf59b77fe4e6101da7f1d2d2615e5244
@@ -210,6 +210,7 @@ Cote d'Ivorie CoteDIvoire
CoteDIvorie CoteDIvoire
YopougonGR926 Yopougon/GR926
Korogho Korhogo
IvoryCoast CoteDIvoire

# CzechRepublic
CzechRep CzechRepublic
@@ -69,3 +69,4 @@ A/Saitama/103(E20) A/Saitama/103/2014
A/MANITOBA/RV27522017 A/Manitoba/RV2752/2017
A/SouthAfrica/R07338/18 A/SouthAfrica/R07338/2018
A/North-WestAuckland/1/2018 A/NorthWestAuckland/1/2018
A/chicken/F611/Egypt/2015(H5N1) A/chicken/Egypt/F611/2015
@@ -317,6 +317,7 @@ BurkinaFaso BurkinaFaso BurkinaFaso BurkinaFaso
# Cambodia
Cambodia Cambodia Cambodia Cambodia
Kampongchamhospital Cambodia KampongCham KampongCham
PhenomPenh Cambodia PhnomPenh PhnomPenh

# Cameroon
Cameroon Cameroon Cameroon Cameroon
@@ -739,6 +740,10 @@ Damietta Egypt Damietta Damietta
Dakahlia Egypt Dakahlia Dakahlia
Aswan Egypt Aswan Aswan
Helwan Egypt Cairo Helwan
Ismailia Egypt Ismailia Ismailia
Gharbia Egypt Gharbia Gharbia
Giza Egypt Giza Giza
NorthSinai Egypt NorthSinai NorthSinai

# ElSalvador
ElSalvador ElSalvador ElSalvador ElSalvador
@@ -1121,19 +1126,38 @@ Yavatmal India Maharashtra Yavatmal
Indonesia Indonesia Indonesia Indonesia
Bali Indonesia Bali Bali
Banten Indonesia Banten Banten
Banyuwangi Indonesia Java Banyuwangi
Bengkulu Indonesia Bengkulu Bengkulu
Bogorwestjava Indonesia WestJava Bogor
CentralJava Indonesia Java CentralJava
EastJava Indonesia Java EastJava
Gorontalo Indonesia Sulawesi Gorontalo
Jakarta Indonesia Jakarta Jakarta
Jambi Indonesia Sumatra Jambi
Java Indonesia Java Java
Kediri Indonesia Java Kediri
Klaten Indonesia Java Klaten
Kuningan Indonesia WestJava Kuningan
LampungUtara Indonesia Sumatra LampungUtara
Lamongan Indonesia Java Lamongan
Lampung Indonesia Sumatra Lampung
Lumajang Indonesia Java Lumajang
Bandungjava Indonesia Java Bandung
Makassar Indonesia Makassar Makassar
NorthSumatra Indonesia Sumatra NorthSumatra
Palembangsumatra Indonesia Sumatra Palembang
Raas Indonesia EastJava Raas
Semarang Indonesia Java Semarang
SouthSumatra Indonesia Sumatra SouthSumatra
Sumatra Indonesia Sumatra Sumatra
Surabaya Indonesia JawaTimur Surabaya
Serpong Indonesia Banten Serpong
Sidrap Indonesia Sulawesi Sidrap
Surakarta Indonesia Jateng Surakarta
Sulawesi Indonesia Sulawesi Sulawesi
Sukoharjo Indonesia Java Sukoharjo
WestJava Indonesia Java WestJava
Yogyakarta Indonesia Java Yogyakarta

# Iran
Iran Iran Iran Iran
@@ -1848,11 +1872,13 @@ Birobidzhan Russia JewishOblast Birobidzhan
Blagoveshchensk Russia Amur Blagoveshchensk
Bryansk Russia Bryansk Bryansk
Buryatia Russia Buryatia Buryatia
Chany Russia Novosibirsk Chany
Cheboksary Russia Chuvash Cheboksary
Chelyabinsk Russia Chelyabinsk Chelyabinsk
Cherkessk Russia Karachay-Cherkess Cherkessk
Chita Russia ZabaykalskyKrai Chita
Dagestan Russia Dagestan Dagestan
Dovolnoe Russia Novosibirsk Dovolnoe
Ekaterinburg Russia SverdlovskOblast Ekaterinburg
GornoAltaysk Russia Altaia GornoAltaysk
Irkutsk Russia Irkutsk Irkutsk
@@ -1901,6 +1927,7 @@ Podporozhie Russia Podporozhie Podporozhie
RostovOnDon Russia Rostov RostovOnDon
Rostov Russia Rostov Rostov
Ryazan Russia Ryazan Ryazan
Sartlan Russia Novosibirsk Sartlan
StPetersburg Russia StPetersburg StPetersburg
Salekhard Russia Yamalia Salekhard
Samara Russia Samara Samara
@@ -2654,6 +2681,8 @@ KhanhHoa Vietnam KhanhHoa KhanhHoa
LangSon Vietnam LangSon LangSon
LongAn Vietnam LongAn LongAn
Mekongdelta Vietnam MekongDelta MekongDelta
QuanhNinh Vietnam QuangNinh QuangNinh
SocTrang Vietnam SocTrang SocTrang
TayNguyen Vietnam DakLak TayNguyen
TayNinh Vietnam TayNinh TayNinh
ThaiBinh Vietnam ThaiBinh ThaiBinh
@@ -283,15 +283,6 @@ def flu_fix_patterns(self, name):
# remove ending parentheses and their contents
if re.match(r'([^(]+)[^)]+\)$', name): # A/Eskisehir/359/2016 (109) -> A/Eskisehir/359/2016 ; A/South Australia/55/2014 IVR145 (14/232) -> A/South Australia/55/2014 IVR145
name = re.match(r'([^(]+)[^)]+\)$', name).group(1)
# Remove B/Vic strain info from name
if re.match(r'([\w\s\-/]+)(\(?)(B/Victoria/2/87|B/Victoria/2/1987)$', name): # B/Finland/150/90 B/Victoria/2/1987 -> B/Finland/150/90
name = re.match(r'([\w\s\-/]+)(\(?)(B/Victoria/2/87|B/Victoria/2/1987)$', name).group(1)
# Separate location info from ID info in strain name
if re.match(r'([A|B]/[^0-9/]+)([0-9]+[A-Za-z]*/[0-9/]*[0-9]{2,4})$', name): #A/Iceland183/2009 A/Baylor4A/1983 A/Beijing262/41/1994
name = re.match(r'([A|B]/[^0-9/]+)([0-9]+[A-Za-z]*/[0-9/]*[0-9]{2,4})$', name).group(1) + "/" + re.match(r'([A|B]/[^0-9/]+)([0-9]+[A-Za-z]*/[0-9/]*[0-9]{2,4})$', name).group(2)
# Remove characters after year info, associated with passage info but can parse that from passage field later
if re.match(r'([A|B]/[A-Za-z-]+/[A-Za-z0-9_-]+/[0-9]{4})(.)+$', name): # B/California/12/2015BX59B A/Shanghai/11/1987/X99/highyieldingreassortant
name = re.match(r'([A|B]/[A-Za-z-]+/[A-Za-z0-9_-]+/[0-9]{4})(.)+$', name).group(1)
# Strip trailing slashes
name = name.rstrip('/') # A/NorthernTerritory/60/68// A/Paris/455/2015/
# Change two digit years to four digit years
@@ -328,14 +319,42 @@ def format_host(self, v):
v['host'] = "avian"
if v['host'] == "anascrecca":
v['host'] = "avian"
if v['host'] == "anasstrepera":
v['host'] = "avian"
if v['host'] == "passerine":
v['host'] = "avian"
if v['host'] == "larusridibundus":
v['host'] = "avian"
if v['host'] == "anascarolinensis":
v['host'] = "avian"
if v['host'] == "us_quail":
v['host'] = "avian"
if v['host'] == "goose":
v['host'] = "avian"
if v['host'] == "anasrubripes":
v['host'] = "avian"
if v['host'] == "anasamericana":
v['host'] = "avian"
if v['host'] == "corvus":
v['host'] = "avian"
if v['host'] == "falcoperegrinus":
v['host'] = "avian"
if v['host'] == "zosteropsjaponicus":
v['host'] = "avian"
if v['host'] == "cygnuscygnus":
v['host'] = "avian"
if v['host'] == "falcon":
v['host'] = "avian"
if v['host'] == "eagle":
v['host'] = "avian"
if v['host'] == "turkey":
v['host'] = "avian"
if v['host'] == "graculareligiosa":
v['host'] = "avian"
if v['host'] == "chencanagica":
v['host'] = "avian"
if v['host'] == "anserindicus":
v['host'] = "avian"
if v['host'] == "passermontanus":
v['host'] = "avian"
if v['host'] == "arenariainterpres":
@@ -344,12 +363,16 @@ def format_host(self, v):
v['host'] = "avian"
if v['host'] == "avian":
v['host'] = "avian"
if v['host'] == "coturnix":
v['host'] = "avian"
if v['host'] == "guineafowl":
v['host'] = "avian"
if v['host'] == "cairinamoschata":
v['host'] = "avian"
if v['host'] == "anascyanoptera":
v['host'] = "avian"
if v['host'] == "feline":
v['host'] = "nonhuman_mammal"
if v['host'] == "watersample":
v['host'] = "environment"
if v['host'] == "surfaceswab":

0 comments on commit 6592e6f

Please sign in to comment.