-
Notifications
You must be signed in to change notification settings - Fork 10
/
config.yaml
108 lines (107 loc) · 3.86 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Data sources whose sequences are pulled in by an ingest run
sources:
  - 'genbank'
# NCBI Taxonomy ID of the pathogen (quoted, so it is read as a string)
ncbi_taxon_id: '12637'
# NCBI Datasets fields to include from the NCBI Datasets output.
# Each entry must be the mnemonic of an NCBI Datasets field; see the docs for
# the full list of fields:
# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
# Note: the "accession" field MUST be provided to match with the sequences
ncbi_datasets_fields:
  - accession
  - sourcedb
  - isolate-lineage
  - geo-region
  - geo-location
  - isolate-collection-date
  - release-date
  - update-date
  - virus-tax-id
  - virus-name
  - length
  - host-name
  - isolate-lineage-source
  - submitter-names
  - submitter-affiliation
  # Requested because field_map renames 'sra-accs' to 'sra_accessions' and
  # metadata_columns lists 'sra_accessions'; without this entry that column
  # presumably has no source data.
  - sra-accs
# Settings consumed by the transform rule
transform:
  # Renames NCBI field names to Nextstrain field names.
  # Renaming is the first step in the pipeline, so every setting below must
  # refer to fields by their new names.
  field_map:
    - 'accession=genbank_accession'
    - 'accession-rev=genbank_accession_rev'
    - 'isolate-lineage=strain'
    - 'sourcedb=database'  # necessary for applying geo location rules
    - 'geo-region=region'
    - 'geo-location=location'
    - 'host-name=host'
    - 'isolate-collection-date=date'
    - 'release-date=release_date'
    - 'update-date=update_date'
    - 'virus-tax-id=virus_tax_id'
    - 'virus-name=virus_name'
    - 'sra-accs=sra_accessions'
    - 'submitter-names=authors'
    - 'submitter-affiliation=institution'
  # Regex a standardized strain name has to match.
  # Any non-empty value is accepted for now because there is no clear
  # standard for strain names.
  strain_regex: '^.+$'
  # Fallback fields for the strain name when 'strain' fails the regex above
  strain_backup_fields:
    - 'genbank_accession'
  # Date fields to standardize
  date_fields:
    - 'date'
    - 'release_date'
    - 'update_date'
  # Date formats expected to appear in the date fields.
  # Use the directives understood by Python's datetime, see
  # https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
  expected_date_formats:
    - '%Y'
    - '%Y-%m'
    - '%Y-%m-%d'
    - '%Y-%m-%dT%H:%M:%SZ'
  # Titlecase rules
  titlecase:
    # Abbreviations that stay uppercase instead of being titlecased
    abbreviations:
      - 'USA'
    # Articles that should not be cast to titlecase
    articles:
      - 'and'
      - 'd'
      - 'de'
      - 'del'
      - 'des'
      - 'di'
      - 'do'
      - 'en'
      - 'l'
      - 'la'
      - 'las'
      - 'le'
      - 'los'
      - 'nad'
      - 'of'
      - 'op'
      - 'sur'
      - 'the'
      - 'y'
    # String fields to titlecase
    fields:
      - 'region'
      - 'country'
      - 'division'
      - 'location'
  # Name of the authors field
  authors_field: 'authors'
  # Value to use when the authors value is empty
  authors_default_value: '?'
  # Name of the field that receives the generated abbreviated authors
  abbr_authors_field: 'abbr_authors'
  # General geolocation rules to apply to geolocation fields
  geolocation_rules_url: 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
  # Local geolocation rules that apply only to this pathogen's data.
  # Local rules can overwrite the general geolocation rules provided above.
  local_geolocation_rules: 'source-data/geolocation-rules.tsv'
  # User annotations file
  annotations: 'source-data/annotations.tsv'
  # ID field used to merge annotations
  annotations_id: 'genbank_accession'
  # Field used as the sequence ID in the FASTA file
  id_field: 'genbank_accession'
  # Field used as the sequence in the FASTA file
  sequence_field: 'sequence'
  # Final output columns for the metadata TSV
  metadata_columns:
    - 'strain'
    - 'genbank_accession'
    - 'genbank_accession_rev'
    - 'date'
    - 'region'
    - 'country'
    - 'division'
    - 'location'
    - 'length'
    - 'host'
    - 'release_date'
    - 'update_date'
    - 'ncbi_serotype'  # inferred from virus_tax_id
    - 'sra_accessions'
    - 'abbr_authors'
    - 'authors'
    - 'institution'