Permalink
Browse files

Enable masking of sites in alignments prior to tree building

Uses the existing interface for defining excluded sites in a VCF-based analysis
to also enable masking of sites from a multiple sequence alignment prior to
building a tree. This includes an initial refactor of the logic to load excluded
sites from a text file, but this refactor is not yet complete.

Users expect to define excluded sites in one-based coordinates when not
providing a BED file, so this commit also enforces this expectation. Excluded
sites are converted to zero-based positions internally regardless of the input
format for consistency with other Python tools that expect zero-based indexing.
  • Loading branch information...
huddlej committed Jan 26, 2019
1 parent ac2e7b4 commit 1df7a80c0fd761b894650640299fa76c4549611b
Showing with 113 additions and 19 deletions.
  1. +113 −19 augur/tree.py
@@ -7,9 +7,12 @@
import sys
import time
import uuid
import Bio
from Bio import Phylo
from treetime.vcf_utils import read_vcf
import numpy as np
from treetime.vcf_utils import read_vcf
from pathlib import Path

from .utils import run_shell_command, nthreads_value

def find_executable(names, default = None):
@@ -196,6 +199,54 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt
return T


def load_excluded_sites(excluded_sites_file):
"""Returns an array of zero-based sites to exclude from a FASTA prior to tree building.
Parameters
----------
excluded_sites_file : str
a path to a BED file (with a .bed extension), a tab-delimited DRM file, or a plain text file with one position per line
Returns
-------
ndarray :
a unique array of positions loaded from the given file
"""
strip_pos = []
is_bed_format = False

if excluded_sites_file is not None:
# Check for BED file extension.
if excluded_sites_file.lower().endswith('.bed'):
import pandas as pd
is_bed_format = True
bed = pd.read_csv(excluded_sites_file, sep='\t')
for index, row in bed.iterrows():
strip_pos.extend(list(range(row[1], row[2]+1)))
else:
# Next, check for DRM-file format or site-per-line format.
with open(excluded_sites_file, 'r') as ifile:
line1 = ifile.readline()
# If the file is tab-delimited, assume it is in DRM-file format.
if '\t' in line1:
strip_pos = [int(line.strip().split('\t')[1]) for line in ifile]
else:
# Finally, fall back to site-per-line format.
strip_pos = [int(line.strip()) for line in ifile]
if line1.strip():
# Add the first line back to the list
strip_pos.append(int(line1.strip()))

strip_pos = np.unique(strip_pos)

# If the given sites are not in BED format, they are one-based positions and
# need to be internally adjusted to zero-based positions.
if not is_bed_format:
strip_pos = strip_pos - 1

return strip_pos


def write_out_informative_fasta(compress_seq, alignment, stripFile=None):
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
@@ -206,23 +257,8 @@ def write_out_informative_fasta(compress_seq, alignment, stripFile=None):
positions = compress_seq['positions']

#If want to exclude sites from initial treebuild, read in here
#IF FIND STANDARDIZED DRM FILE FORMAT, IMPLEMENT HERE
strip_pos = []
if stripFile:
if stripFile.lower().endswith('.bed'): #BED format file
import pandas as pd
bed = pd.read_csv(stripFile, sep='\t')
for index, row in bed.iterrows():
strip_pos.extend(list(range(row[1], row[2]+1)))
else: #site-per-line format or DRM-file format
with open(stripFile, 'r') as ifile:
line1 = ifile.readline()
if '\t' in line1: #DRM-file format
strip_pos = [int(line.strip().split('\t')[1]) for line in ifile]
else: #site-per-line
strip_pos = [int(line.strip()) for line in ifile]
strip_pos.append(int(line1.strip())) #add back 1st line
strip_pos = np.unique(strip_pos)
strip_pos = load_excluded_sites(stripFile)

#Get sequence names
seqNames = list(sequences.keys())

@@ -269,6 +305,58 @@ def write_out_informative_fasta(compress_seq, alignment, stripFile=None):
return fasta_file


def mask_sites_in_multiple_sequence_alignment(alignment_file, excluded_sites_file):
"""Creates a new multiple sequence alignment FASTA file from which the given
excluded sites have been removed and returns the filename of the new
alignment.
Parameters
----------
alignment_file : str
path to the original multiple sequence alignment file
excluded_sites_file : str
path to a text file containing each nucleotide position to exclude with one position per line
Returns
-------
str
path to the new FASTA file from which sites have been excluded
"""
# Load alignment.
alignment = Bio.AlignIO.read(alignment_file, "fasta")

# Load zero-based excluded sites.
excluded_sites = load_excluded_sites(excluded_sites_file)

# Return the original alignment file, if no excluded sites were found.
if len(excluded_sites) == 0:
return alignment_file

# Find sites to include in the final alignment.
sites = np.arange(alignment.get_alignment_length())
included_sites = np.setdiff1d(sites, excluded_sites)

# Build the final alignment by slicing.
alignment_sites = [alignment[:, site:site+1] for site in included_sites]
final_alignment = alignment_sites[0]
for alignment_site in alignment_sites[1:]:
final_alignment += alignment_site

# Confirm that the final alignment is shorter than the original, if there
# were sites to exclude.
assert len(excluded_sites) == 0 or (final_alignment.get_alignment_length() < alignment.get_alignment_length())

# Write out the new alignment FASTA to disk.
alignment_file_path = Path(alignment_file)
masked_alignment_file = str(alignment_file_path.parent / f"masked_{alignment_file_path.name}")
with open(masked_alignment_file, "w") as oh:
Bio.AlignIO.write(final_alignment, oh, "fasta")

# Return the new alignment FASTA filename.
return masked_alignment_file


def register_arguments(parser):
parser.add_argument('--alignment', '-a', required=True, help="alignment in fasta or VCF format")
parser.add_argument('--method', default='iqtree', choices=["fasttree", "raxml", "iqtree"], help="tree builder to use")
@@ -278,14 +366,16 @@ def register_arguments(parser):
parser.add_argument('--nthreads', type=nthreads_value, default=1,
help="number of threads to use; specifying the value 'auto' will cause the number of available CPU cores on your system, if determinable, to be used")
parser.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to')
parser.add_argument('--exclude-sites', type=str, help='file name of sites to exclude for raw tree building (VCF only)')
parser.add_argument('--exclude-sites', type=str, help='file name of one-based sites to exclude for raw tree building (BED format in .bed files, DRM format in tab-delimited files, or one position per line)')


def run(args):
# check alignment type, set flags, read in if VCF
is_vcf = False
ref = None
if any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
# Prepare a multiple sequence alignment from the given variants VCF and
# reference FASTA.
if not args.vcf_reference:
print("ERROR: a reference Fasta is required with VCF-format alignments")
return 1
@@ -294,7 +384,11 @@ def run(args):
ref = compress_seq['reference']
is_vcf = True
aln = sequences
elif args.exclude_sites:
# Mask excluded sites from the given multiple sequence alignment.
aln = mask_sites_in_multiple_sequence_alignment(args.alignment, args.exclude_sites)
else:
# Use the multiple sequence alignment as is.
aln = args.alignment

start = time.time()

0 comments on commit 1df7a80

Please sign in to comment.