Skip to content
Permalink
Browse files

Merge pull request #268 from nextstrain/split-alignments

Align sequences in smaller partitions to speed up builds
  • Loading branch information
trvrb committed Mar 25, 2020
2 parents 042790b + 2d75959 commit b074bc19276878ae4cb5359d9e6409729b45b31f
Showing with 72 additions and 4 deletions.
  1. +36 −4 Snakefile
  2. +36 −0 scripts/partition-sequences.py
@@ -79,28 +79,60 @@ rule filter:
--output {output.sequences}
"""

checkpoint partition_sequences:
input:
sequences = rules.filter.output.sequences
output:
split_sequences = directory("results/split_sequences")
params:
sequences_per_group = 150
shell:
"""
python3 scripts/partition-sequences.py \
--sequences {input.sequences} \
--sequences-per-group {params.sequences_per_group} \
--output-dir {output.split_sequences}
"""

rule align:
message:
"""
Aligning sequences to {input.reference}
- gaps relative to reference are considered real
"""
input:
sequences = "results/filtered.fasta",
sequences = "results/split_sequences/{i}.fasta",
reference = files.reference
output:
alignment = "results/aligned.fasta"
alignment = "results/split_alignments/{i}.fasta"
threads: 2
shell:
"""
augur align \
--sequences {input.sequences} \
--reference-sequence {input.reference} \
--output {output.alignment} \
--nthreads auto \
--nthreads {threads} \
--remove-reference \
--fill-gaps
"""

def _get_alignments(wildcards):
checkpoint_output = checkpoints.partition_sequences.get(**wildcards).output[0]
return expand("results/split_alignments/{i}.fasta",
i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i)

rule aggregate_alignments:
message: "Collecting alignments"
input:
alignments = _get_alignments
output:
alignment = "results/aligned.fasta"
shell:
"""
cat {input.alignments} > {output.alignment}
"""

rule mask:
message:
"""
@@ -110,7 +142,7 @@ rule mask:
- masking other sites: {params.mask_sites}
"""
input:
alignment = rules.align.output.alignment
alignment = rules.aggregate_alignments.output.alignment
output:
alignment = "results/masked.fasta"
params:
@@ -0,0 +1,36 @@
"""Split sequences for multiple sequence alignment into smaller chunks to speed up alignment.
"""
import argparse
from augur.align import read_sequences
from Bio import SeqIO
from pathlib import Path


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--sequences", required=True, nargs="+", help="FASTA file of sequences to partition into smaller chunks")
parser.add_argument("--sequences-per-group", required=True, type=int, help="number of sequences to include in each group")
parser.add_argument("--output-dir", required=True, help="directory to write out partitioned sequences")

args = parser.parse_args()

# Read sequences with augur to benefit from additional checks for duplicates.
sequences = list(read_sequences(*args.sequences).values())

# Create the requested output directory.
output_dir = Path(args.output_dir)
output_dir.mkdir(exist_ok=True)

# Determine partition indices.
indices = list(range(0, len(sequences), args.sequences_per_group))

# Add a final index to represent the end of the last partition.
if indices[-1] != len(sequences):
indices.append(len(sequences))

# Partition sequences into groups of no more than the requested number.
for i in range(len(indices) - 1):
# Save partitioned sequences to a new FASTA file named after the partition number.
print("Write out %i sequences for partition %i" % (indices[i + 1] - indices[i], i))
output_path = output_dir / Path("%i.fasta" % i)
SeqIO.write(sequences[indices[i]:indices[i + 1]], output_path, 'fasta')

0 comments on commit b074bc1

Please sign in to comment.
You can’t perform that action at this time.