## CANCELLED - see 20180223 for amplicon extraction

In [3]:
from Bio import AlignIO
from collections import OrderedDict

In [7]:
# parameters for filtering the alignment
min_species = 21
min_aligned = 190

def snp_count(alignment, start=0, end=None):
    # list variants using coordinates in seq1
    if end == None:
        end = alignment.get_alignment_length()
    snps_data = []
    chrom = alignment[0].id
    aln_start = alignment[0].annotations['start'] + 1 # convert to 1-based for compartibility with genome browsers
    ngaps = 0
    
    for i in range(alignment.get_alignment_length()):
        col = alignment[:, i]
        if col[0] == '-':
            ngaps += 1
        snps = set(col)
        if len(snps) > 1 and i in range(start, end): # variable position within region of interest
            ref = col[0]
            snp_data = '%s\t%d\t%s' % (chrom, aln_start + i - ngaps, ref)
            snps.remove(ref)
            alt_data = []
            for snp in snps:
                alt_data += [snp, col.count(snp)]
            snp_data += '\t%s\t%d\t%s' % (','.join(alt_data[0::2]), col.count(ref), \
                                          ','.join([str(i) for i in alt_data[1::2]]))
            snps_data.append(snp_data)
    return snps_data

# test output for a single good position
for alignment in AlignIO.parse("../../data/AgamP3_maf/chr2L.maf", "maf"):
    if len(alignment) >= min_species and alignment.get_alignment_length() >= min_aligned:
        print('Alignment of %i sequences, total length %i' % (len(alignment), alignment.get_alignment_length()))
        print(alignment[0].seq)
        print(snp_count(alignment))
        break

Alignment of 21 sequences, total length 198
CTTTCTTTTCGCCAACCTCATCGTGACGATCGTGGTGCTCGGTGGACTGTTGGCAACGATCGAAAAGCATCTGCCTACTGCCATCCGGCAGACGTTCCGGTATGACAAGCATGCACTGAAGGGATCATCG---GTCCGATTGGTGTCCCTGCTGGAGATCCTGAAGGCA-----CACACATT--CACAAACAAACATT
['AgamP3.chr2L\t71583\tC\tT\t18\t3', 'AgamP3.chr2L\t71586\tT\tC\t18\t3', 'AgamP3.chr2L\t71587\tC\tT\t18\t3', 'AgamP3.chr2L\t71589\tT\tA,G\t5\t5,11', 'AgamP3.chr2L\t71590\tT\tC\t19\t2', 'AgamP3.chr2L\t71593\tG\tA\t20\t1', 'AgamP3.chr2L\t71594\tC\tT\t4\t17', 'AgamP3.chr2L\t71595\tC\tG\t18\t3', 'AgamP3.chr2L\t71598\tC\tT\t19\t2', 'AgamP3.chr2L\t71601\tC\tG\t7\t14', 'AgamP3.chr2L\t71604\tC\tT\t20\t1', 'AgamP3.chr2L\t71605\tG\tA,T\t17\t2,2', 'AgamP3.chr2L\t71607\tG\tA,C\t14\t5,2', 'AgamP3.chr2L\t71610\tG\tC,A\t9\t9,3', 'AgamP3.chr2L\t71611\tA\tG\t6\t15', 'AgamP3.chr2L\t71613\tC\tT,G\t14\t2,5', 'AgamP3.chr2L\t71616\tG\tA,T,C\t16\t3,1,1', 'AgamP3.chr2L\t71619\tG\tA,T\t16\t1,4', 'AgamP3.chr2L\t71622\tC\tA,T,G\t14\t1,1,5', 'AgamP3.chr2L\t71625\tT\tC,G\t16\

In [8]:
import re

# constraint on target sequence parameters and initial conserved flanks
# only first match per alignment is returned
min_conserved = 20
max_target = 100
min_ns_in_target = 3
max_gapopens_in_target = 2

def seq_repr(alignment):
    # sequence representation of alignment - based on first sequence in alignment
    seq = ''
    for i in range(alignment.get_alignment_length()):
        col = alignment[:, i]
        if col[0] == '-':
            seq += '-'
        elif len(set(col)) == 1:
            seq += col[0]
        else:
            seq += 'N'
    return seq
            
def find_targets(seq, min_conserved, max_target, min_ns_in_target, max_gapopens_in_target):
    
    prog = re.compile('[ACTG]{%d,}([ACTGN-]{,%d})[ACTG]{%d,}' % (min_conserved, max_target, min_conserved))
    match = re.search(prog, seq)
    if match:
        if match.group(1).count('N') >= min_ns_in_target:
            if len(re.findall('-', match.group(1))) <= max_gapopens_in_target:
                return (match.span(), match.span(1))
    return False
    
for alignment in AlignIO.parse("../../data/AgamP3_maf/chr2L.maf", "maf"):
    if len(alignment) >= min_species and alignment.get_alignment_length() >= min_aligned:
        seq = seq_repr(alignment)
        
        ampl = find_targets(seq, min_conserved, max_target, min_ns_in_target, max_gapopens_in_target)
        if ampl:
            print(len(seq))
            print(seq)
            print(alignment.get_alignment_length())
            print(alignment[0].seq)
            print(ampl)
            break
    

257
NNNN-NN-NNNTNGNGNCANC-NAANGNTNNGGNNNNNNNNNNNNNNNNN-NNNNNNNN---N---NNNNNNNN---NNNNNNNNNNNN---NNATNNGAN---NNNNNN--NNNNCCNAGGTNTCNTTNATAAACTTCGTTGCTTCACTNTGTGGNGCNGGNGGTATTCAAGCATTCAAAACCATGCGAACTCTTAGNGCNCTGAGACCNCTACGTGCNATGTCCCGNATGCAGGGAATGAGGGTACGTAANNNNN
257
TTAA-TT-ATATGGAGACAAC-AAAAGTTATGGGCGGTTATGGATAAACT-ACACGATA---A---TAATTCAT---CAAACATTCACT---GGATTTGAC---CCATAC--TTTTCCTAGGTATCTTTGATAAACTTCGTTGCTTCACTTTGTGGAGCTGGTGGTATTCAAGCATTCAAAACCATGCGAACTCTTAGAGCCCTGAGACCACTACGTGCCATGTCCCGTATGCAGGGAATGAGGGTACGTAATTAAA
((130, 252), (150, 232))


In [9]:
# extend flanks until their conservation is high enough
max_ns = 0.1
max_gaps = 0
def extend_flank(seq, flank_span, max_ns, left=True):

    if left:
        max_flank_len = flank_span[0]
    else:
        max_flank_len = len(seq) - flank_span[1]
    for i in range(max_flank_len):
        flank = seq[flank_span[0] : flank_span[1]]
        flank_ns = flank.count('N') / float(len(flank))
        flank_gaps = flank.count('-') / float(len(flank))
        if flank_ns > max_ns or flank_gaps > max_gaps:
            break
        if left:
            flank_span[0] -= 1
        else:
            flank_span[1] += 1
        
    return flank_span

for alignment in AlignIO.parse("../../data/AgamP3_maf/chr2L.maf", "maf"):
    if len(alignment) >= min_species and alignment.get_alignment_length() >= min_aligned:
        seq = seq_repr(alignment)
        
        ampl = find_targets(seq, min_conserved, max_target, min_ns_in_target, max_gapopens_in_target)
        if ampl:
            lflank = extend_flank(seq, [ampl[0][0], ampl[1][0]], max_ns, left=True)
            rflank = extend_flank(seq, [ampl[1][1], ampl[0][1]], max_ns, left=False)
            print(lflank)
            print(rflank)
            
            break

[123, 150]
[232, 255]


In [10]:
# scoring formula for the amplicons - unused

def score_ampl(seq, target, lflank, rflank):
    
    return (lflank[1]-lflank[0] + rflank[1]-rflank[0]) / (target[1]-target[0]) * \
                    seq[target[0]:target[1]].count('N')
i = 0
for alignment in AlignIO.parse("../data/AgamP3_maf/chr2L.maf", "maf"):
    if len(alignment) >= min_species and alignment.get_alignment_length() >= min_aligned:
        seq = seq_repr(alignment)
        
        ampl = find_targets(seq, min_conserved, max_target, min_ns_in_target, max_gapopens_in_target)
        if ampl:
            lflank = extend_flank(seq, [ampl[0][0], ampl[1][0]], max_ns, left=True)
            rflank = extend_flank(seq, [ampl[1][1], ampl[0][1]], max_ns, left=False)
            if rflank[1] - lflank[0] > 200:
                print(len(seq))
                print(alignment[0].annotations['start'])
                print(seq)
                print(ampl)
                print(lflank)
                print(rflank)
                print(score_ampl(seq, ampl[0], lflank, rflank))
                break

346
13536469
NNN----------------NNNNNNNNNNN-NNNN--------------NNCAGCAANGGATGTCANCGNTGCGTTGTAATTTATCAGAAGCANATAAACATACTNCAGTATAAAGAAAGTGAGTGTGAAGATTTTTATNCNGGCCAGTTGAATCCAGCTGCAATGCCTATTCATAGCGTTTTAAAAAGTGATCGGGGCTTAGCTCATCCCAACATGAACGGGCATTCAAGAATATCNTCATCGGATCCNTATATGCGATCCAATGAAGATAGCGACAATTGTAAGC-AANAG---NNAANNNCTNNNNTTTATNNTTTTTATNAGNGANNNA-TCCNNNN-NN--
((70, 237), (93, 193))
[49, 93]
[193, 288]
3.3293413173652695


In [11]:
%pwd

'/Users/am60/malaria'

In [15]:
# final product filtering and output
min_ampl_len = 190

chromosomes = ['2L','2R','3L','3R','X']
chrom_files = ["../../data/AgamP3_maf/chr" + chrom + ".maf" for chrom in chromosomes]

targets = ''
flanks = ''

# this actually takes some time
for chrom_file in chrom_files:
    for alignment in AlignIO.parse(chrom_file, "maf"):
        if len(alignment) >= min_species and alignment.get_alignment_length() >= min_aligned:
            seq = seq_repr(alignment)
            ampl = find_targets(seq, min_conserved, max_target, min_ns_in_target, max_gapopens_in_target)
            if ampl:
                lflank = extend_flank(seq, [ampl[0][0], ampl[1][0]], max_ns, left=True)
                rflank = extend_flank(seq, [ampl[1][1], ampl[0][1]], max_ns, left=False)
                if rflank[1] - lflank[0] > min_ampl_len:
                    targets += '\n'.join(snp_count(alignment, ampl[1][0], ampl[1][1])) + '\n' # target
                    flanks += '\n'.join(snp_count(alignment, lflank[0], lflank[1])) + '\n'
                    flanks += '\n'.join(snp_count(alignment, rflank[0], rflank[1])) + '\n'

print(targets[:100])

AgamP3.chr2L	13536532	T	C	15	6
AgamP3.chr2L	13536544	C	A	19	2
AgamP3.chr2L	13536578	A	G	19	2
AgamP3.


In [16]:
with open('../20180129_phylo_ampliseq/anopheles_phylo_targets.txt', 'w') as o:
    o.write(targets)
    
with open('../20180129_phylo_ampliseq/anopheles_phylo_flanks.txt', 'w') as o:
    o.write(flanks)
    

Results are saved into two separate files, these can be imported in Excel as different spreadsheets

## TODO

- Use pandas df and ExcelWriter for output formatting
- Optionally: extract minimally filtered hits, then perform subsequent filtering on that dataset