## Extract potential amplicons from MAF alignment
* Iterate over chromosomal MAF alignments
* Find conserved parts of an alignment using sliding window approach
* Amplicons can also span non-conserved regions, these are further referenced as long
* Write amplicon MAFs and target coordinates as TSV

Parameter for alignment extraction are listed below.

## Next step - 20180309 - extract phylogenetic information for amplicons

In [18]:
from Bio import AlignIO
import numpy as np

In [38]:
# Alignment filtering parameters
min_species = 21 # minimum number of species in alignment
min_aligned = 190 # minimum alignment length, also used as minimum amplicon length

min_conserved = 50 # minimum length of flanks with given conservation level - used for primer design and, if possible, target sites
max_xs = 0.1 # maximum proportion of indels (represented as X) in flanks
max_ns = 0.1 # maximum proportion of substitutions (represented as N) in flanks

max_insert = 100 # maximum length of non-conserved sequence between two conserved flanks 

def seq_repr(alignment):
    '''
    Given multiple sequence alignment, return first sequence with Ns for ambiguous chars and X's for indels.'''
    seq = ''
    for i in range(alignment.get_alignment_length()):
        col = alignment[:, i]
        if '-' in col: # indel stronger than substitution
            seq += 'X'
        elif len(set(col)) == 1:
            seq += col[0]
        else:
            seq += 'N'
    return seq

def get_conserved_subsequences(seq, max_ns=0.1, max_xs=0.1, min_len=100):
    '''
    Given sequence, conservation (max_ns) and indel (max_xs) levels, and minimum subsequence length
    return list of tuples for the subsequences with given conservation level (overlapping regions merged).
    If no conserved subsequences found, return 'None'.'''
    slen = len(seq)
    if slen < min_len:
        return None
    
    def is_conserved(s, max_ns, max_xs):
        if s.count('N')/len(s) <= max_ns and s.count('X')/len(s) <= max_xs:
            return True
        else:
            return False
    cons_windows = [is_conserved(seq[i:i + min_len], max_ns, max_xs) for i in range(slen - min_len + 1)]
    if sum(cons_windows) == 0:
        return None
    
    cons_kernels = []
    in_kernel = False
    for i, cw in enumerate(cons_windows):
        if in_kernel:
            if cw == False:
                in_kernel = False
                cons_kernels.append(i + min_len)
        elif cw == True:
            cons_kernels.append(i)
            in_kernel = True
    if in_kernel:
        cons_kernels.append(i + min_len) 
        
    # merge overlapping kernels
    merged_kernels = []
    for i in range(len(cons_kernels)//2):
        start = cons_kernels[i * 2]
        end = cons_kernels[i * 2 + 1]
        if not merged_kernels:
            merged_kernels.append((start, end))
        else:
            prev_start = merged_kernels[-1][0]
            prev_end = merged_kernels[-1][1]
            if prev_end >= start:
                upper_bound = max(prev_end, end)
                merged_kernels[-1] = (prev_start, upper_bound)  # replace by merged interval
            else:
                merged_kernels.append((start, end))
    
    return np.asarray(merged_kernels)

# functions test
# for alignment in AlignIO.parse("../../data/AgamP3_maf/chr2L.maf", "maf"):
#     if len(alignment) >= min_species and alignment.get_alignment_length() >= min_aligned:
#         seq = seq_repr(alignment)
#         cons = get_conserved_subsequences(seq, max_ns=max_ns, max_xs=max_xs, min_len=min_conserved)
#         if cons is not None: # conser
#             print(seq)
#             print(cons, cons[:,1] - cons[:,0])
#             break

XXXXXXXAGANANCTTATAATTAGCGNNTAATTTACNTTCGATNAATTGGCAGGGGCAAGCTNCGNNAANNAATCNTCNNNTTTATGNGGATTNNANNNATTCCANNGNNNCNGNNTCGGTAAGTACGACNCCNNNNTCNANNNNNCNNGNNGANNNNNNNGANACNATNAANACTACNACNCCNNANGNAANNTNTGGGANTGNTGCTGACAGATGNTATGCATNNGNNCNNCTNCCGNNCANCNNGANNCNNTNTGNAANCGNTANCNNACNTTNCGNNNNCNNGGNNANXXXXXXNNNNGGNNTNNGNCCNAGNNTNGAGATGNGNNTNGCNNTNAANCACGNNATNCTNGNNGACGAAGANCTNNTCNCNTACAGNNCNGGNCCNGANCTGACNNNNATTCTNGGNCNCGACCTCTCCANATACCANCNNATGANNGNGAANGANNTNATNNTGAANCNNATNGTNACNNGNNTGANCANNNXXXXXXANNTNCANNXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXNXXXXXXXXXXXXXXXXXXNCNNNNNNNNNNNXXXXXXXXXXXXXXNNNXXXXXXXXXXXXNCNANNNNNNNNNTGANAANCGNAANGAGCTCATNATATCANCAAAACAATTCAAAGATGGATACNCCTATNNNNNANCGNNGANNNNNNNNCGNNCXXXNAGNTANNTGGANCANTTNTGGNTNTGNGGATCGNGGTNXXXXXXX
[[ 11  66]
 [744 796]] [55 52]


In [72]:
# Candidate amplicon search - within conserved sequences and between consecutive conserved sequences
def get_candidate_amplicons(cons, min_len=190, max_insert=50):
    '''
    Given conservation intervals, minimum amplicon lenght and maximum insert length,
    return np.array of plausible amplicons with insert positions'''
    ampls = []
    for reg in cons: # internal amplicons
        if reg[1] - reg[0] >= min_len:
            ampls.append((reg[0], reg[1], 0, 0))
    for i in range(len(cons) - 1):
        for j in range(i + 1, len(cons)):
            if cons[j, 0] - cons[i, 1] <= max_insert:
                if cons[j, 1] - cons[i, 0] >= min_len:
                    ampls.append((cons[i, 0], cons[j, 1],
                                  cons[i, 1], cons[j, 0]))
    return ampls

# function test - long run
# for alignment in AlignIO.parse("../../data/AgamP3_maf/chr2L.maf", "maf"):
#     if len(alignment) >= min_species and alignment.get_alignment_length() >= min_aligned:
#         seq = seq_repr(alignment)
#         cons = get_conserved_subsequences(seq, max_ns=max_ns, max_xs=max_xs, min_len=min_conserved)
#         if cons is not None:
#             ampls = get_candidate_amplicons(cons, min_aligned, max_insert)
#             if len(ampls) > 0:
#                 for reg in cons:
#                     print(reg, seq[reg[0]:reg[1]])
#                 for ampl in ampls:
#                     print(alignment[:, ampl[0]:ampl[1]])
#                 print(ampls)
#                 break

[110 227] TCNTTCATGTACCANGAACGCGCCATGACCGAGGGNTGNGCCTCNCCACCACCNGCNGCGGTACTGAGACTTCACAATCAATCNCCATACTACAATATNGTGCGTTTNNXTTNCTTN
[1613 1926] AGNNGNACCGANTCGTCCAANCAGCTGGACGCGAAGATGAAGCACAGTAAGGANCTGGACNNGXXXGNNGCNGGNGGXXXCCCNGGNGACCACGGCTATAACCCGAACCACTATATGCGTTCNATTCCAGGTCTNATNTATCACGGCACGCAATCGAGCACNTCNTCCGATTTNTCGCCGATGTCCGAACAAAAGTCNTTACCNCGGCGTGGCCGTTCAAGGTACCATCATCTTCANCTTCATANCACTAACACCACACCANGNCACAAGNCCTCCAAAGCTCAGTCACCANTAGCGTCNCCCCGTAGTAGNN
[1941 2018] XXNACNAXXXNAACGTNCCGATACAGCTTCCCCGCATGCCGTCCCAGTTTCGNCCNATNCACAGTAGTNGAACNATN
SingleLetterAlphabet() alignment with 21 rows and 313 columns
AGCAGAACCGAATCGTCCAATCAGCTGGACGCGAAGATGAAGCA...GTT AgamP3.chr2L
AGCAGAACCGAATCGTCCAATCAGCTGGACGCGAAGATGAAGCA...GTT AgamS1.chrscf_1106392397111
AGCAGAACCGAATCGTCCAATCAGCTGGACGCGAAGATGAAGCA...GTT AgamM1.chrscf_1925491356
AGCAGAACCGAATCGTCCAATCAGCTGGACGCGAAGATGAAGCA...GTT AmerM1.chrsupercont1.338
AGCAGAACCGAATCGTCCAATCAGCTGGACGCGAAGATGAAGCA...GTT AaraD1.chrKB704529
AGCAGAACCGAATCGTCCAATCAGCTGGA

In [86]:
def gapped_coord(aln, coord, ref=0):
    '''
    Transforms coordinate in maf alignment according to number of gaps in ref (i-th seq in alignment)
    '''
    ngaps = str(aln[ref, :coord].seq).count('-')
    return aln[ref].annotations['start'] + coord - ngaps

def alignment_to_amplicons(alignment, min_species, min_aligned, max_xs, max_ns, min_conserved, max_insert, annotated=True):
    '''
    Given alignment and filtering paramters
    return list of (alignment, target start, target end)
    '''
    ampl_data = []
    if len(alignment) >= min_species and alignment.get_alignment_length() >= min_aligned:
        seq = seq_repr(alignment)
        cons = get_conserved_subsequences(seq, max_ns=max_ns, max_xs=max_xs, min_len=min_conserved)
        if cons is not None:
            ampls = get_candidate_amplicons(cons, min_aligned, max_insert)
            if len(ampls) > 0:
                for ampl in ampls:
                    ampl_aln = alignment[:, ampl[0]:ampl[1]]
                    if annotated:
                        ampl_aln[0].annotations = alignment[0].annotations.copy()
                        ampl_aln[0].annotations['start'] = gapped_coord(alignment, ampl[0])
                        ampl_aln[0].annotations['size'] = gapped_coord(alignment, ampl[1]) - ampl_aln[0].annotations['start']
                    ampl_data.append((ampl_aln, (ampl[2] - ampl[0], ampl[3] - ampl[0])))
                return ampl_data
    return None
                    
# function test - long run
# amplicons = []
# for alignment in AlignIO.parse("../../data/AgamP3_maf/chr2L.maf", "maf"):
#     a = alignment_to_amplicons(alignment, min_species, min_aligned, max_xs, max_ns, min_conserved, max_insert)
#     if a is not None:
#         amplicons.extend(a)
# #         print(amplicons)
# #         break

# print(len(amplicons))

26


In [88]:
# exctract amplicons from all chromosomes
chromosomes = ['2L','2R','3L','3R','X']
chrom_files = ["../../data/AgamP3_maf/chr" + chrom + ".maf" for chrom in chromosomes]

amplicons = []

# this actually takes some time
for chrom_file in chrom_files:
    for alignment in AlignIO.parse(chrom_file, "maf"):
        a = alignment_to_amplicons(alignment, min_species, min_aligned, max_xs, max_ns, min_conserved, max_insert)
        if a is not None:
            amplicons.extend(a)
print(len(amplicons))

126


In [89]:
%pwd

'/Users/am60/malaria/20180129_phylo_ampliseq'

In [103]:
# write amplicons
count = 0
with open("../20180129_phylo_ampliseq/20180226_amplicons.maf", "w") as handle:
    for a in amplicons:
        count += AlignIO.write(a[0], handle, "maf")
count

126

In [105]:
# write targets
with open("../20180129_phylo_ampliseq/20180226_targets.txt", 'w') as handle:
    for a in amplicons:
        handle.write('\t'.join([str(c) for c in a[1]]) + '\n')

In [106]:
!grep AgamP3.chr 20180226_amplicons.maf | cut -d ' ' -f2 | sort | uniq -c
 

  26 AgamP3.chr2L
  49 AgamP3.chr2R
  17 AgamP3.chr3L
  33 AgamP3.chr3R
   1 AgamP3.chrX


# TODO

- target search: clusters of N/Xs + 20180226_targets.txt
- target search: use DistanceCalculator on concatenated targets to select minimum set of phylogenetically informative markers
- apart from prospective amplicons and targets, produce reports and trees per amplicon

# Other amplicon-related developments

- search amplicons and targets in mtDNA data
- same for 16S, but also consider species ID PCR
- limited taxonomic primer pick from maf's (e.g., species marker for gambiae complex)
- incorporation of population data - variable sites evaluated by frequency/geographic associations
- Chris Kozak CRISPR target design script should be similar to population examination (look in Ag1000g notebooks)
