## CANCELLED - see 20180223 for amplicon extraction


## Extract potential amplicons from MAF alignment
* Iterate over chromosomal MAF alignments
* Find conserved parts of an alignment using sliding window approach

Parameter for alignment extraction are listed below.

In [1]:
# check biopython version - need >=1.69 to support MAF
import Bio
Bio.__version__

'1.70'

In [21]:
from Bio import AlignIO
import numpy as np

In [30]:
# Alignment selection parameters
min_species = 21 # minimum number of species in alignment
min_aligned = 50 # minimum alignment length

# Alignment filtering parameters
min_conserved = 50 # minimum length of region with given conservation level
max_xs = 0.1 # maximum proportion of indels (represented as X) in alignments
max_ns = 0.1 # maximum proportion of substitutions (represented as N) in alignments

def seq_repr(alignment):
    '''
    Given multiple sequence alignment, return first sequence with Ns for ambiguous chars and X's for indels.'''
    seq = ''
    for i in range(alignment.get_alignment_length()):
        col = alignment[:, i]
        if '-' in col: # indel stronger than substitution
            seq += 'X'
        elif len(set(col)) == 1:
            seq += col[0]
        else:
            seq += 'N'
    return seq

def get_conserved_subsequences(seq, max_ns=0.1, max_xs=0.1, min_len=100):
    '''
    Given sequence, conservation (max_ns) and indel (max_xs) levels, and minimum subsequence length
    return list of tuples for the subsequences with given conservation level (overlapping regions merged).
    If no conserved subsequences found, return 'None'.'''
    slen = len(seq)
    if slen < min_len:
        return None
    
    def is_conserved(s, max_ns, max_xs):
        if s.count('N')/len(s) <= max_ns and s.count('X')/len(s) <= max_xs:
            return True
        else:
            return False
    cons_windows = [is_conserved(seq[i:i + min_len], max_ns, max_xs) for i in range(slen - min_len + 1)]
    if sum(cons_windows) == 0:
        return None
    
    cons_kernels = []
    in_kernel = False
    for i, cw in enumerate(cons_windows):
        if in_kernel:
            if cw == False:
                in_kernel = False
                cons_kernels.append(i + min_len)
        elif cw == True:
            cons_kernels.append(i)
            in_kernel = True
    if in_kernel:
        cons_kernels.append(i + min_len) 
        
    # merge overlapping kernels
    merged_kernels = []
    for i in range(len(cons_kernels)//2):
        start = cons_kernels[i * 2]
        end = cons_kernels[i * 2 + 1]
        if not merged_kernels:
            merged_kernels.append((start, end))
        else:
            prev_start = merged_kernels[-1][0]
            prev_end = merged_kernels[-1][1]
            if prev_end >= start:
                upper_bound = max(prev_end, end)
                merged_kernels[-1] = (prev_start, upper_bound)  # replace by merged interval
            else:
                merged_kernels.append((start, end))
    
    return np.asarray(merged_kernels)

def gapped_coord(aln, coord, ref=0):
    '''
    Transforms coordinate in maf alignment according to number of gaps in ref (i-th seq in alignment)
    '''
    ngaps = str(aln[ref, :coord].seq).count('-')
    return aln[ref].annotations['start'] + coord - ngaps

def alignment_filter(alignment, min_species, min_aligned, max_xs, max_ns, 
                           min_conserved, annotated=True):
    '''
    Given alignment and filtering paramters
    return list of matching subalignments 
    '''
    tgt_data = []
    if len(alignment) >= min_species and alignment.get_alignment_length() >= min_aligned:
        seq = seq_repr(alignment)
        cons = get_conserved_subsequences(seq, max_ns=max_ns, max_xs=max_xs, min_len=min_conserved)
        if cons is not None:
            for tgt in cons:
                tgt_aln = alignment[:, tgt[0]:tgt[1]]
                if annotated:
                    tgt_aln[0].annotations = alignment[0].annotations.copy()
                    tgt_aln[0].annotations['start'] = gapped_coord(alignment, tgt[0])
                    tgt_aln[0].annotations['size'] = gapped_coord(alignment, tgt[1]) - tgt_aln[0].annotations['start']
                
                tgt_data.append(tgt_aln)
    return tgt_data

In [33]:
import sys



agam_size = 0
aln_size = 0
with open("/Users/am60/malaria/20180129_phylo_ampliseq/21sp_50_50_01_01.maf", "w") as maf_file, \
     open("/Users/am60/malaria/20180129_phylo_ampliseq/21sp_50_50_01_01.fa", "w") as fasta_file:
    for chrom in ['2L','2R','3L','3R','X']:
        for alignment in AlignIO.parse("/Users/am60/data/AgamP3_maf/chr{}.maf".format(chrom), "maf"):
            alns = alignment_filter(alignment, min_species, min_aligned, max_xs, max_ns, 
                                   min_conserved)
            if len(alns) > 0:
                for aln in alns:
                    agam_size += aln[0].annotations['size']
                    aln_size += aln.get_alignment_length()
                    AlignIO.write(aln, maf_file, "maf")
                    fasta_file.write('>{}_{}_{}\n{}\n'.format(
                            aln[0].id,
                            aln[0].annotations['start'],
                            aln[0].annotations['size'],
                            aln[0].seq))
print(agam_size, aln_size)

335852 348308


In [17]:
for alignment in AlignIO.parse("/Users/am60//data/AgamP3_maf/chr2L.maf", "maf"):
    print(alignment[0].seq)
    break

NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN