Skip to content

Commit

Permalink
ID strain by accession
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 committed Oct 11, 2023
1 parent 81a072a commit 8ab810f
Show file tree
Hide file tree
Showing 8 changed files with 80 additions and 5 deletions.
45 changes: 40 additions & 5 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
if not config:
configfile: "config/config_dengue.yaml"

serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4']

rule all:
Expand Down Expand Up @@ -98,12 +101,14 @@ rule filter:
params:
group_by = "year region",
sequences_per_group = filter_sequences_per_group,
min_length = 5000
min_length = 5000,
strain_id = config.get("strain_id_field", "strain"),
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--output {output.sequences} \
--group-by {params.group_by} \
Expand Down Expand Up @@ -161,17 +166,19 @@ rule refine:
metadata = "results/metadata_{serotype}.tsv"
output:
tree = "results/tree_{serotype}.nwk",
node_data = "results/branch-lengths_{serotype}.json"
node_data = "results/branch-lengths_{serotype}.json",
params:
coalescent = "const",
date_inference = "marginal",
clock_filter_iqd = 4
clock_filter_iqd = 4,
strain_id = config.get("strain_id_field", "strain"),
shell:
"""
augur refine \
--tree {input.tree} \
--alignment {input.alignment} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output-tree {output.tree} \
--output-node-data {output.node_data} \
--timetree \
Expand Down Expand Up @@ -228,12 +235,14 @@ rule traits:
node_data = "results/traits_{serotype}.json",
params:
columns = traits_columns,
sampling_bias_correction = 3
sampling_bias_correction = 3,
strain_id = config.get("strain_id_field", "strain"),
shell:
"""
augur traits \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output {output.node_data} \
--columns {params.columns} \
--confidence \
Expand Down Expand Up @@ -270,18 +279,44 @@ rule export:
aa_muts = "results/aa-muts_{serotype}.json",
auspice_config = files.auspice_config
output:
auspice_json = "auspice/dengue_{serotype}.json"
auspice_json = "results/raw_dengue_{serotype}.json",
root_sequence = "results/raw_dengue_{serotype}_root-sequence.json",
params:
strain_id = config.get("strain_id_field", "strain"),
shell:
"""
augur export v2 \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--node-data {input.branch_lengths} {input.traits} {input.clades} {input.nt_muts} {input.aa_muts} \
--auspice-config {input.auspice_config} \
--include-root-sequence \
--output {output.auspice_json}
"""

rule final_strain_name:
input:
auspice_json="results/raw_dengue_{serotype}.json",
metadata="results/metadata_{serotype}.tsv",
root_sequence="results/raw_dengue_{serotype}_root-sequence.json",
output:
auspice_json="auspice/dengue_{serotype}.json",
root_sequence="auspice/dengue_{serotype}_root-sequence.json",
params:
strain_id=config.get("strain_id_field", "strain"),
display_strain_field=config.get("display_strain_field", "strain"),
shell:
"""
python3 bin/set_final_strain_name.py \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--input-auspice-json {input.auspice_json} \
--display-strain-name {params.display_strain_field} \
--output {output.auspice_json}
cp {input.root_sequence} {output.root_sequence}
"""

rule clean:
"""Removing directories: {params}"""
params:
Expand Down
38 changes: 38 additions & 0 deletions bin/set_final_strain_name.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pandas as pd
import json, argparse
from augur.io import read_metadata

def replace_name_recursive(node, lookup):
if node["name"] in lookup:
node["name"] = lookup[node["name"]]

if "children" in node:
for child in node["children"]:
replace_name_recursive(child, lookup)

if __name__=="__main__":
parser = argparse.ArgumentParser(
description="Swaps out the strain names in the Auspice JSON with the final strain name",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json")
parser.add_argument('--metadata', type=str, required=True, help="input data")
parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice")
parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
args = parser.parse_args()

metadata = read_metadata(args.metadata, id_columns=args.metadata_id_columns)
name_lookup = {}
for ri, row in metadata.iterrows():
strain_id = row.name
name_lookup[strain_id] = args.display_strain_name if pd.isna(row[args.display_strain_name]) else row[args.display_strain_name]

with open(args.input_auspice_json, 'r') as fh:
data = json.load(fh)

replace_name_recursive(data['tree'], name_lookup)

with open(args.output, 'w') as fh:
json.dump(data, fh)
2 changes: 2 additions & 0 deletions config/config_dengue.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
strain_id_field: "accession"
display_strain_field: "strain"
Binary file modified example_data/sequences_all.fasta.zst
Binary file not shown.
Binary file modified example_data/sequences_denv1.fasta.zst
Binary file not shown.
Binary file modified example_data/sequences_denv2.fasta.zst
Binary file not shown.
Binary file modified example_data/sequences_denv3.fasta.zst
Binary file not shown.
Binary file modified example_data/sequences_denv4.fasta.zst
Binary file not shown.

0 comments on commit 8ab810f

Please sign in to comment.