Merge pull request #640 from nextstrain/support-default-gisaid-metadata

Support default GISAID metadata and sequences
nextstrain · May 27, 2021 · c1a2de4 · c1a2de4
2 parents 3e484b8 + e153db3
commit c1a2de4
Show file tree

Hide file tree

Showing 9 changed files with 526 additions and 101 deletions.
diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml
@@ -10,6 +10,32 @@ strip_strain_prefixes:
   - hCoV-19/
   - SARS-CoV-2/
 
+sanitize_metadata:
+  rename_fields:
+    - "Virus name=strain"
+    - "Type=type"
+    - "Accession ID=gisaid_epi_isl"
+    - "Collection date=date"
+    - "Additional location information=additional_location_information"
+    - "Sequence length=length"
+    - "Host=host"
+    - "Patient age=patient_age"
+    - "Gender=sex"
+    - "Clade=GISAID_clade"
+    - "Pango lineage=pango_lineage"
+    - "Pangolin version=pangolin_version"
+    - "Variant=variant"
+    - "AA Substitutions=aa_substitutions"
+    - "aaSubtitutions=aa_substitutions"
+    - "Submission date=date_submitted"
+    - "Is reference?=is_reference"
+    - "Is complete?=is_complete"
+    - "Is high coverage?=is_high_coverage"
+    - "Is low coverage?=is_low_coverage"
+    - "N-Content=n_content"
+    - "GC-Content=gc_content"
+  parse_location_field: Location
+
 reference_node_name: "USA/WA1/2020"
 
 # Define files used for external configuration. Common examples consist of a

diff --git a/docs/change_log.md b/docs/change_log.md
@@ -3,6 +3,19 @@
 As of April 2021, we use major version numbers (e.g. v2) to reflect backward incompatible changes to the workflow that likely require you to update your Nextstrain installation.
 We also use this change log to document new features that maintain backward compatibility, indicating these features by the date they were added.
 
+## v7 (27 May 2021)
+
+For more details about this release, see [the configuration reference for the new "sanitize metadata" parameters](https://nextstrain.github.io/ncov/configuration.html#sanitize_metadata) and [the corresponding pull request](https://github.com/nextstrain/ncov/pull/640).
+
+### Major changes
+
+- Deduplicate metadata and sequences from each `inputs` dataset at the beginning of the workflow.
+
+### Features
+
+- Support full GISAID metadata and sequences from the "Download packages" interface by converting this default format into Nextstrain-compatible metadata and sequences.
+- Support reading metadata and sequences directly from GISAID's tar archives. For example, you can now define `inputs` as `metadata: data/ncov_north-america.tar.gz` and `sequences: data/ncov_north-america.tar.gz` to decompress and read the corresponding data from the archive.
+
 ## New features since last version update
 
  - 25 May 2021: Support custom Auspice JSON prefixes with a new configuration parameter, `auspice_json_prefix`. [See the configuration reference for more details](https://nextstrain.github.io/ncov/configuration.html#auspice_json_prefix). ([#643](https://github.com/nextstrain/ncov/pull/643))

diff --git a/docs/configuration.md b/docs/configuration.md
@@ -497,6 +497,44 @@ Valid attributes for list entries in `inputs` are provided below.
 * description: A list of prefixes to strip from strain names in metadata and sequence records to maintain consistent strain names when analyzing data from multiple sources.
 * default: `["hCoV-19/", "SARS-CoV-2/"]`
 
+## sanitize_metadata
+* type: object
+* description: Parameters to configure how to sanitize metadata to a Nextstrain-compatible format.
+
+### parse_location_field
+* type: string
+* description: Field in the metadata that stores GISAID-formatted location details (e.g., `North America / USA / Washington`) to be parsed into `region`, `country`, `division`, and `location` fields.
+* default: `Location`
+
+### rename_fields
+* type: array
+* description: List of `old=new` pairs, each mapping a field name in the input metadata to the name that field should have in the sanitized metadata.
+* default:
+```yaml
+    - "Virus name=strain"
+    - "Type=type"
+    - "Accession ID=gisaid_epi_isl"
+    - "Collection date=date"
+    - "Additional location information=additional_location_information"
+    - "Sequence length=length"
+    - "Host=host"
+    - "Patient age=patient_age"
+    - "Gender=sex"
+    - "Clade=GISAID_clade"
+    - "Pango lineage=pango_lineage"
+    - "Pangolin version=pangolin_version"
+    - "Variant=variant"
+    - "AA Substitutions=aa_substitutions"
+    - "aaSubtitutions=aa_substitutions"
+    - "Submission date=date_submitted"
+    - "Is reference?=is_reference"
+    - "Is complete?=is_complete"
+    - "Is high coverage?=is_high_coverage"
+    - "Is low coverage?=is_low_coverage"
+    - "N-Content=n_content"
+    - "GC-Content=gc_content"
+```
+
 ## subsampling
 * type: object
 * description: Schemes for subsampling data prior to phylogenetic inference to avoid sampling bias or focus an analysis on specific spatial and/or temporal scales. [See the SARS-CoV-2 tutorial for more details on defining subsampling schemes](https://docs.nextstrain.org/en/latest/tutorials/SARS-CoV-2/steps/customizing-analysis.html#subsampling).

diff --git a/scripts/adjust_regional_meta.py b/scripts/adjust_regional_meta.py
@@ -41,11 +41,17 @@
     metadata.loc[metadata.region != focal_region, 'location'] = ""
     metadata.loc[metadata.region != focal_region, 'division'] = metadata.region
     metadata.loc[metadata.region != focal_region, 'country'] = metadata.region
-    metadata.loc[metadata.region != focal_region, 'division_exposure'] = metadata.region_exposure
-    metadata.loc[metadata.region != focal_region, 'country_exposure'] = metadata.region_exposure
-    metadata.loc[(metadata.region == focal_region) & (metadata.region_exposure != focal_region), 'division_exposure'] = metadata.region_exposure
-    metadata.loc[(metadata.region == focal_region) & (metadata.region_exposure != focal_region), 'country_exposure'] = metadata.region_exposure
-    metadata.loc[(metadata.region == focal_region) & (metadata.division_exposure.isna()), 'division_exposure'] = metadata.division
-    metadata.loc[(metadata.region == focal_region) & (metadata.country_exposure.isna()), 'country_exposure'] = metadata.country
+
+    if "region_exposure" in metadata.columns:
+        metadata.loc[metadata.region != focal_region, 'division_exposure'] = metadata.region_exposure
+        metadata.loc[metadata.region != focal_region, 'country_exposure'] = metadata.region_exposure
+        metadata.loc[(metadata.region == focal_region) & (metadata.region_exposure != focal_region), 'division_exposure'] = metadata.region_exposure
+        metadata.loc[(metadata.region == focal_region) & (metadata.region_exposure != focal_region), 'country_exposure'] = metadata.region_exposure
+
+    if "division_exposure" in metadata.columns:
+        metadata.loc[(metadata.region == focal_region) & (metadata.division_exposure.isna()), 'division_exposure'] = metadata.division
+
+    if "country_exposure" in metadata.columns:
+        metadata.loc[(metadata.region == focal_region) & (metadata.country_exposure.isna()), 'country_exposure'] = metadata.country
 
     metadata.to_csv(args.output, index=False, sep="\t")
diff --git a/scripts/combine-and-dedup-fastas.py b/scripts/combine-and-dedup-fastas.py