Perform QC for the 62 primer pairs designed:

- create degenerate primer sequences to assert coordinate and sequence ambiguity correctness
- get alignment statistics for internal part of product
- blast versus outgroup dipterans

In [1]:
# read_table used by pybedtools
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
import re
import sys
import pandas as pd
import numpy as np
import pybedtools
from Bio import AlignIO
from Bio.Seq import Seq
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from collections import defaultdict

In [2]:
AMPL_FILE = 'data/20180927_62_markers.xlsx'
ANN_FILE = 'data/20180927_62_markers_ann.csv' # copied to ../../data/panel_info.csv
ALN_FILE = 'data/20180927_62_markers.maf'
DATA_DIR = "../../../data/"
CHR_FILE = DATA_DIR + "comparative/AgamP3_C21/chr{}.maf"
GENES_BED = pybedtools.BedTool(DATA_DIR + "genome/AgamP3/Anopheles-gambiae-PEST_BASEFEATURES_AgamP3.8.gff3")
REPEATS_BED = pybedtools.BedTool(DATA_DIR + "genome/AgamP3/Anopheles-gambiae-PEST_REPEATFEATURES_AgamP3.gff3")
COMB_DB = DATA_DIR + 'genome/AgAaCpDm'

## Import amplicon data

In [3]:
ampl_data = pd.read_excel(AMPL_FILE)
ampl_data.head()

Unnamed: 0,ID,SNP,F,R,chr,start,end
0,531_cons,,ACATTAGTGCTTTATTACGTGGT,CATTTCTTTGGCACTTTCTGAT,3R,44497305,44497495
1,432_cons,,ACCTGCATTCGTATGTGTTCTA,TTCGAGGTGGGAGATTTTCTTC,3R,4532316,4532529
2,473_var,,TGCTATGGTACCTAAACCTACG,CGATGGGTCAAACGGAATCG,3R,20821297,20821521
3,193_var,,GCCATGACGATATCTACAATGC,GCGAACGCTGATTCTAATTTTG,2R,16469698,16469932
4,126_var,low freq,TTCAGGAGCTCACCAAATCG,TCCTTCACCTGTTGTACTGTAT,2L,45852869,45853061


In [4]:
# sort amplicons by chromosome and start 
ampl_data = ampl_data.sort_values(by=['chr','start']).reset_index(drop=True)
ampl_data.head(10)

Unnamed: 0,ID,SNP,F,R,chr,start,end
0,31_var,high freq,TGTSTACGGTCTGAAGAACATc,TTATCCGGCTCCAAGTTAAGG,2L,10356417,10356608
1,33_other,low freq,GAGCGtGCGGCcAAGATG,ACAgACCGACGTTAATGGC,2L,10357458,10357648
2,44_other,low freq,CAGTCAAATTTCCAGACAATCT,CGGAAGTGCATTTGAAGG-AAaA,2L,12084495,12084690
3,47_other,low freq,GaTATAAATTGTCGATCACACAAACT,TGCATTTATCGTAGTACAATCTCA,2L,13862864,13863055
4,58_cons,high freq,ATGcTBGTCATgATGATGATCT,CCGATCCACGATAAGGAGTAC,2L,18693481,18693714
5,59_other,low freq,GCTGGCGCATAATTATCaCAAA,tTTCCACTTCATCGCTCGC,2L,19065171,19065363
6,65_other,,GCAAAATTTCCGTCCCATTA,TGTAATTAGCTGTGTCTTGTG,2L,20840388,20840620
7,77_var,high freq,GTcTCgGAGCACATYGTG,TCGTACTTCATTATTCTTTGGACTG,2L,26760700,26760890
8,83_var,high freq,AGTGRCTCCAGACGGTgTT,CCAAGGATTTGCTACTACCAcT,2L,27674646,27674836
9,112_cons,high freq,GTATGTAGCGDGGTCCATTG,GTGCTATCGTTAAGAGGTCGTA,2L,40854755,40854948


## Extract alignments for amplicons

In [5]:
def ann_to_aln_coord(aln, ann_coord, ref=0):
    '''
    Transforms annotation coordinate into maf alignment position
    accounting for number of gaps in ref (i-th seq in alignment)
    '''
    gapped_coord = ann_coord - aln[ref].annotations['start']
    out_coord = 0
    i = 0
    while i < gapped_coord:
        if aln[ref][out_coord] != '-':
            i += 1
        out_coord += 1
    return out_coord
def get_subaln(alns, intervals):
    """
    Given maf file with alignments and pandas iterrows object containing sorted starts and ends, 
    yield list of subalignments matching these coordinates
    """
    def se_from_row(row):
        return (row[1]['start'], row[1]['end'])
    subalns = []
    (start, end) = se_from_row(next(intervals))
    alns_iter = AlignIO.parse(alns, "maf")
    alignment = next(alns_iter)
    while True:
        # quickly skip alignments until end coordinate falls within alignment - assume slignments coordinate-sorted
        if alignment[0].annotations['start'] + alignment.get_alignment_length() < end:
            alignment = next(alns_iter)
            continue
        # more thorough check accounting for gaps in alignment
        if alignment[0].annotations['start'] \
               + alignment.get_alignment_length() \
               - (alignment[0].seq).count('-') < end:
            alignment = next(alns_iter)
            continue
        try:
            #print(alignment[0].annotations, str(alignment[0].seq))
            in_start = ann_to_aln_coord(alignment, start)
            in_end = ann_to_aln_coord(alignment, end)
        except:
            raise ValueError('Coordniate error!\nSequence: {}\nAnnotation start: {}\n'
                             'Start: {}\nEnd: '.format(alignment[0].seq, alignment[0].annotations['start'], start, end))
        subalns.append(alignment[:, in_start:in_end])
        try:
            (start, end) = se_from_row(next(intervals))
        except:
            return subalns
if not os.path.isfile(ALN_FILE):
    x = get_subaln(CHR_FILE.format('2R'), ampl_data.iloc[21:24].iterrows())
    print(x)

In [6]:
# LONG RUN requires reading all mafs
# extract alignments from amplicons
if not os.path.isfile(ALN_FILE):
    ampl_alns = []
    for seqid in ('2L', '2R', '3L', '3R', 'X'):
        ampl_alns.extend(get_subaln(CHR_FILE.format(seqid), 
                                    ampl_data[ampl_data.chr==seqid].iterrows()))
    len(ampl_alns)
else:
    ampl_alns = list(AlignIO.parse(ALN_FILE, "maf"))

In [7]:
# write amplicons 
if not os.path.isfile(ALN_FILE):
    count = 0
    with open(ALN_FILE, "w") as handle:
        for a in ampl_alns:
            count += AlignIO.write(a, handle, "maf")
    count

## Re-create degenerate primers

In [8]:
def get_primer(aln, primer, reverse=False, min_alts=2, debug=False):
    '''
    Locate primer in first sequence in the alignment,
    (at start for F, at end for R)
    return primer (based on input primer length) with ambiguities coded as [ref/alt/...].
    
    Parameters:
    aln - input alignment in MAF format
    primer - primer sequence used to regulate output primer length
    reverse - is primer located on reverse strand
    min_alts - minimum number of genomes with alt alleles to be treated as ambiguous.
    Positions with lower number of alts are converted to lowercase.
    '''
    primer = primer.replace('-','')
    if debug:
        print(aln[0].seq, primer)
    seq = ''
    # get primer lengtb 
    plen = len(primer)
    i = 0
    if reverse:
        pos = aln.get_alignment_length() - 1
        while i < plen:
            if aln[0][pos] != '-':
                i += 1
            
            col = [str(Seq(nt).reverse_complement()) for nt in aln[:, pos]]
            ref = col[0]
            alt = set(col)
            alt.remove(ref)
            if len(alt) == 0: # no alts
                seq += ref
            elif len(alt) == 1 and col.count(''.join(alt)) < min_alts: # single occurence of alt in alignment
                seq += ref.lower()
            else:
                #print(''.join(col))
                seq += '[{}/{}]'.format(ref, '/'.join(alt))
            pos -= 1
    # forward strand primer
    else:
        pos = 0
        while i < plen:
            if aln[0][pos] != '-':
                i += 1
            col = aln[:, pos]
            ref = col[0]
            alt = set(col)
            alt.remove(ref)
            if len(set(col)) == 1:
                seq += ref
            elif len(alt) == 1 and col.count(''.join(alt)) < min_alts: # single occurence of alt in alignment
                seq += ref.lower()
            else:
                alt = set(col)
                alt.remove(ref)
                seq += '[{}/{}]'.format(ref, '/'.join(alt))
            pos += 1
    
    # reverse
    
        
    return seq

print(get_primer(ampl_alns[0], 'TGTSTACGGTCTGAAGAACATc', debug=True))
print(get_primer(ampl_alns[0], 'TTATCCGGCTCCAAGTTAAGG', reverse=True, debug=True))

TGTGTACGGTCTGAAGAACATCCAGGCCGATGAGATGGTGGAGTTCTCCTCCGGACTTAAGGTAA---AA--TA-----A--AGCTGGATT-------C--AA-----T--TCCCCCCGTCCCTCCCGCCCCTGCTATGTGCTAGCG-CAGC---GCTA-------TTAACCG----------TCG-CA-----------------CGCTCGATTACGCT-TACTTCCAGGGCATGGCCCTTAACTTGGAGCCGGATAA TGTSTACGGTCTGAAGAACATc
TGT[G/C]TACGGTCTGAAGAACATc
TGTGTACGGTCTGAAGAACATCCAGGCCGATGAGATGGTGGAGTTCTCCTCCGGACTTAAGGTAA---AA--TA-----A--AGCTGGATT-------C--AA-----T--TCCCCCCGTCCCTCCCGCCCCTGCTATGTGCTAGCG-CAGC---GCTA-------TTAACCG----------TCG-CA-----------------CGCTCGATTACGCT-TACTTCCAGGGCATGGCCCTTAACTTGGAGCCGGATAA TTATCCGGCTCCAAGTTAAGG
TT[A/G]TCCGGCTCCAAGTTAAGG


In [9]:
ampl_data['F_deg'] = np.vectorize(get_primer)(ampl_alns, ampl_data['F'])
ampl_data['R_deg'] = np.vectorize(get_primer)(ampl_alns, ampl_data['R'], reverse=True)
ampl_data

Unnamed: 0,ID,SNP,F,R,chr,start,end,F_deg,R_deg
0,31_var,high freq,TGTSTACGGTCTGAAGAACATc,TTATCCGGCTCCAAGTTAAGG,2L,10356417,10356608,TGT[G/C]TACGGTCTGAAGAACATc,TT[A/G]TCCGGCTCCAAGTTAAGG
1,33_other,low freq,GAGCGtGCGGCcAAGATG,ACAgACCGACGTTAATGGC,2L,10357458,10357648,GAGCGtGCGGCcAAGATG,ACAgACCGACGTTAATGGC
2,44_other,low freq,CAGTCAAATTTCCAGACAATCT,CGGAAGTGCATTTGAAGG-AAaA,2L,12084495,12084690,CAGTCAAATTTCCAGACAATCT,CGGAAGTGCATTTGAAGG-AAaA
3,47_other,low freq,GaTATAAATTGTCGATCACACAAACT,TGCATTTATCGTAGTACAATCTCA,2L,13862864,13863055,GaTATAAATTGTCGATCACACAAACT,TGCATTTATCGTAGTACAATCTCA
4,58_cons,high freq,ATGcTBGTCATgATGATGATCT,CCGATCCACGATAAGGAGTAC,2L,18693481,18693714,ATGcT[C/G/T]GTCATgATGATGATCT,CCGATCCACGATAAGGAGTAC
5,59_other,low freq,GCTGGCGCATAATTATCaCAAA,tTTCCACTTCATCGCTCGC,2L,19065171,19065363,GCTGGCGCATAATTATCaCAAA,tTTCCACTTCATCGCTCGC
6,65_other,,GCAAAATTTCCGTCCCATTA,TGTAATTAGCTGTGTCTTGTG,2L,20840388,20840620,GCAAAATTTCCGTCCCATTA,TGTAATTAGCTGTGTCTTGTG
7,77_var,high freq,GTcTCgGAGCACATYGTG,TCGTACTTCATTATTCTTTGGACTG,2L,26760700,26760890,GTcTCgGAGCACAT[C/T]GTG,TCGTACTTCATTATTCTTTGGACTG
8,83_var,high freq,AGTGRCTCCAGACGGTgTT,CCAAGGATTTGCTACTACCAcT,2L,27674646,27674836,AGTG[A/G]CTCCAGACGGTgTT,CC[A/G/T]AGGATTTGCTACTACCAcT
9,112_cons,high freq,GTATGTAGCGDGGTCCATTG,GTGCTATCGTTAAGAGGTCGTA,2L,40854755,40854948,G[T/A]ATGTAGCG[G/A/T]GGTCCATTG,GTGCTATCGTTAAGAGGTCGTA


## Alignment statistics

In [10]:
def identical_clusters(aln, fp, rp):
    '''
    Given alignment, forward and reverse primers, 
    for insert located between primers,
    return list of sets with species IDs with identical sequences
    and number of discriminated lineages'''
    
    ins = aln[:, len(fp.replace('-','')):-len(rp.replace('-',''))]
    
    ids = [set()]
    dm = DistanceCalculator('identity').get_distance(ins)
    dm.names = [n.split('.')[0] for n in dm.names]
    for i in range(len(dm)):
        for j in range(i + 1, len(dm)):
            if dm[i,j] == 0:
                n1 = dm.names[i]
                n2 = dm.names[j]
                for cl in ids:
                    if (n1 in cl):
                        if (n2 in cl):
                            break
                        if (n2 not in cl):
                            cl.add(n2)
                            break
                else:
                    ids.append(set((n1, n2)))
    
    id_clusters = ids[1:]
    
    discrim = len(dm) - sum([len(cl)-1 for cl in id_clusters]) 
        
    return (id_clusters, discrim)
identical_clusters(ampl_alns[0], 'TGTSTACGGTCTGAAGAACATc', 'TTATCCGGCTCCAAGTTAAGG')

([{'AsteI2', 'AsteS1'}], 20)

In [11]:
ampl_data['identical_seq'],ampl_data['idenified_lineages'] = np.vectorize(identical_clusters)(ampl_alns, 
                                                                                ampl_data['F'], 
                                                                                ampl_data['R'])
ampl_data

Unnamed: 0,ID,SNP,F,R,chr,start,end,F_deg,R_deg,identical_seq,idenified_lineages
0,31_var,high freq,TGTSTACGGTCTGAAGAACATc,TTATCCGGCTCCAAGTTAAGG,2L,10356417,10356608,TGT[G/C]TACGGTCTGAAGAACATc,TT[A/G]TCCGGCTCCAAGTTAAGG,"[{AsteS1, AsteI2}]",20
1,33_other,low freq,GAGCGtGCGGCcAAGATG,ACAgACCGACGTTAATGGC,2L,10357458,10357648,GAGCGtGCGGCcAAGATG,ACAgACCGACGTTAATGGC,"[{AgamS1, AmerM1, AmelC1, AgamM1, AquaS1, Aara...",13
2,44_other,low freq,CAGTCAAATTTCCAGACAATCT,CGGAAGTGCATTTGAAGG-AAaA,2L,12084495,12084690,CAGTCAAATTTCCAGACAATCT,CGGAAGTGCATTTGAAGG-AAaA,"[{AgamS1, AgamM1, AgamP3}]",14
3,47_other,low freq,GaTATAAATTGTCGATCACACAAACT,TGCATTTATCGTAGTACAATCTCA,2L,13862864,13863055,GaTATAAATTGTCGATCACACAAACT,TGCATTTATCGTAGTACAATCTCA,"[{AgamS1, AmerM1, AmelC1, AgamM1, AquaS1, Aara...",13
4,58_cons,high freq,ATGcTBGTCATgATGATGATCT,CCGATCCACGATAAGGAGTAC,2L,18693481,18693714,ATGcT[C/G/T]GTCATgATGATGATCT,CCGATCCACGATAAGGAGTAC,"[{AgamS1, AmerM1, AgamM1, AquaS1, AaraD1, Agam...",15
5,59_other,low freq,GCTGGCGCATAATTATCaCAAA,tTTCCACTTCATCGCTCGC,2L,19065171,19065363,GCTGGCGCATAATTATCaCAAA,tTTCCACTTCATCGCTCGC,"[{AsteS1, AsteI2}]",15
6,65_other,,GCAAAATTTCCGTCCCATTA,TGTAATTAGCTGTGTCTTGTG,2L,20840388,20840620,GCAAAATTTCCGTCCCATTA,TGTAATTAGCTGTGTCTTGTG,[],15
7,77_var,high freq,GTcTCgGAGCACATYGTG,TCGTACTTCATTATTCTTTGGACTG,2L,26760700,26760890,GTcTCgGAGCACAT[C/T]GTG,TCGTACTTCATTATTCTTTGGACTG,"[{AgamM1, AgamP3}]",20
8,83_var,high freq,AGTGRCTCCAGACGGTgTT,CCAAGGATTTGCTACTACCAcT,2L,27674646,27674836,AGTG[A/G]CTCCAGACGGTgTT,CC[A/G/T]AGGATTTGCTACTACCAcT,"[{AsteS1, AsteI2}]",20
9,112_cons,high freq,GTATGTAGCGDGGTCCATTG,GTGCTATCGTTAAGAGGTCGTA,2L,40854755,40854948,G[T/A]ATGTAGCG[G/A/T]GGTCCATTG,GTGCTATCGTTAAGAGGTCGTA,"[{AquaS1, AgamS1, AgamP3}, {AgamM1, AaraD1}, {...",15


In [12]:
ALL_SP = ['AgamP3', 'AgamS1', 'AgamM1', 'AmerM1', 'AaraD1',
         'AquaS1', 'AmelC1', 'AchrA1', 'AepiE1', 'AminM1',
         'AculA1', 'AfunF1', 'AsteS1', 'AsteI2', 'AmacM1',
         'AfarF1', 'AdirW1', 'AsinS1', 'AatrE1', 'AdarC2',
         'AalbS1']

def seq_repr(alignment):
    '''
    Given multiple sequence alignment, return first sequence with Ns for ambiguous chars and X's for indels.'''
    seq = ''
    for i in range(alignment.get_alignment_length()):
        col = alignment[:, i]
        if '-' in col: # indel has higher priority than substitution
            seq += 'X'
        elif len(set(col)) == 1:
            seq += col[0]
        else:
            seq += 'N'
    return seq

def get_aln_stats(aln, fp, rp):
    
    ins = aln[:, len(fp.replace('-','')):-len(rp.replace('-',''))]
    aln_seq = seq_repr(aln)
    ins_seq = seq_repr(ins)
    aln_sp = [seq.id.split('.')[0] for seq in aln]
    unaln_sp = ';'.join([sp for sp in ALL_SP if sp not in aln_sp])
    
    return (len(aln), # number of aligned species
           unaln_sp, # unaligned species IDs
           aln.get_alignment_length(),
           len(str(aln[0].seq).replace('-','')), # agam amplicon length
           ins.get_alignment_length(),
           len(str(ins[0].seq).replace('-','')), # agam insert length
           aln_seq.count('N'),
           aln_seq.count('X'),
           ins_seq.count('N'),
           ins_seq.count('X'),
          )
get_aln_stats(ampl_alns[2], 'CAGTCAAATTTCCAGACAATCT', 'CGGAAGTGCATTTGAAGG-AAaA')

(16, 'AchrA1;AsinS1;AatrE1;AdarC2;AalbS1', 234, 195, 190, 152, 42, 81, 41, 80)

In [13]:
(ampl_data['aligned_spp'], ampl_data['unaligned_spp'],
 ampl_data['aligned_ampl_length'], ampl_data['agam_ampl_length'],
 ampl_data['aligned_insert_length'], ampl_data['agam_insert_length'],
 ampl_data['total_snvs'], ampl_data['total_indels'],
 ampl_data['insert_snvs'], ampl_data['insert_indels']) = np.vectorize(get_aln_stats)(ampl_alns, 
                                                                                ampl_data['F'], 
                                                                                ampl_data['R'])
ampl_data.head()

Unnamed: 0,ID,SNP,F,R,chr,start,end,F_deg,R_deg,identical_seq,...,aligned_spp,unaligned_spp,aligned_ampl_length,agam_ampl_length,aligned_insert_length,agam_insert_length,total_snvs,total_indels,insert_snvs,insert_indels
0,31_var,high freq,TGTSTACGGTCTGAAGAACATc,TTATCCGGCTCCAAGTTAAGG,2L,10356417,10356608,TGT[G/C]TACGGTCTGAAGAACATc,TT[A/G]TCCGGCTCCAAGTTAAGG,"[{AsteS1, AsteI2}]",...,21,,259,191,216,148,33,139,30,139
1,33_other,low freq,GAGCGtGCGGCcAAGATG,ACAgACCGACGTTAATGGC,2L,10357458,10357648,GAGCGtGCGGCcAAGATG,ACAgACCGACGTTAATGGC,"[{AgamS1, AmerM1, AmelC1, AgamM1, AquaS1, Aara...",...,21,,190,190,153,153,36,0,33,0
2,44_other,low freq,CAGTCAAATTTCCAGACAATCT,CGGAAGTGCATTTGAAGG-AAaA,2L,12084495,12084690,CAGTCAAATTTCCAGACAATCT,CGGAAGTGCATTTGAAGG-AAaA,"[{AgamS1, AgamM1, AgamP3}]",...,16,AchrA1;AsinS1;AatrE1;AdarC2;AalbS1,234,195,190,152,42,81,41,80
3,47_other,low freq,GaTATAAATTGTCGATCACACAAACT,TGCATTTATCGTAGTACAATCTCA,2L,13862864,13863055,GaTATAAATTGTCGATCACACAAACT,TGCATTTATCGTAGTACAATCTCA,"[{AgamS1, AmerM1, AmelC1, AgamM1, AquaS1, Aara...",...,21,,255,191,205,141,12,107,11,107
4,58_cons,high freq,ATGcTBGTCATgATGATGATCT,CCGATCCACGATAAGGAGTAC,2L,18693481,18693714,ATGcT[C/G/T]GTCATgATGATGATCT,CCGATCCACGATAAGGAGTAC,"[{AgamS1, AmerM1, AgamM1, AquaS1, AaraD1, Agam...",...,21,,270,233,227,190,32,74,29,74


In [14]:
ampl_data['prop_id_lineages'] = ampl_data['idenified_lineages']/ampl_data['aligned_spp']
ampl_data.head()

Unnamed: 0,ID,SNP,F,R,chr,start,end,F_deg,R_deg,identical_seq,...,unaligned_spp,aligned_ampl_length,agam_ampl_length,aligned_insert_length,agam_insert_length,total_snvs,total_indels,insert_snvs,insert_indels,prop_id_lineages
0,31_var,high freq,TGTSTACGGTCTGAAGAACATc,TTATCCGGCTCCAAGTTAAGG,2L,10356417,10356608,TGT[G/C]TACGGTCTGAAGAACATc,TT[A/G]TCCGGCTCCAAGTTAAGG,"[{AsteS1, AsteI2}]",...,,259,191,216,148,33,139,30,139,0.952381
1,33_other,low freq,GAGCGtGCGGCcAAGATG,ACAgACCGACGTTAATGGC,2L,10357458,10357648,GAGCGtGCGGCcAAGATG,ACAgACCGACGTTAATGGC,"[{AgamS1, AmerM1, AmelC1, AgamM1, AquaS1, Aara...",...,,190,190,153,153,36,0,33,0,0.619048
2,44_other,low freq,CAGTCAAATTTCCAGACAATCT,CGGAAGTGCATTTGAAGG-AAaA,2L,12084495,12084690,CAGTCAAATTTCCAGACAATCT,CGGAAGTGCATTTGAAGG-AAaA,"[{AgamS1, AgamM1, AgamP3}]",...,AchrA1;AsinS1;AatrE1;AdarC2;AalbS1,234,195,190,152,42,81,41,80,0.875
3,47_other,low freq,GaTATAAATTGTCGATCACACAAACT,TGCATTTATCGTAGTACAATCTCA,2L,13862864,13863055,GaTATAAATTGTCGATCACACAAACT,TGCATTTATCGTAGTACAATCTCA,"[{AgamS1, AmerM1, AmelC1, AgamM1, AquaS1, Aara...",...,,255,191,205,141,12,107,11,107,0.619048
4,58_cons,high freq,ATGcTBGTCATgATGATGATCT,CCGATCCACGATAAGGAGTAC,2L,18693481,18693714,ATGcT[C/G/T]GTCATgATGATGATCT,CCGATCCACGATAAGGAGTAC,"[{AgamS1, AmerM1, AgamM1, AquaS1, AaraD1, Agam...",...,,270,233,227,190,32,74,29,74,0.714286


## Gene annotation

In [15]:
# create list of BED intervals for amplicons
amplicon_beds = ampl_data[['chr', 'start', 'end']].to_string(header=False, index=False).split('\n')
amplicon_beds[0]

' 2L  10356417  10356608'

In [16]:
def bt_to_df(bt):
    '''
    Convert bedtool to pandas dataframe replacing empty files with None'''
    if len(bt) > 0:
        return bt.to_dataframe()
    else:
        return None
    
def annotate_interval(bed_str, genes, repeats):
    '''
    Annotate interval in string format genes and repats annotation tracks
    '''
    def bt_to_df(bt):
        '''
        Convert bedtool to pandas dataframe'''
        if len(bt) > 0:
            return bt.to_dataframe()
        else:
            return pd.DataFrame()

    def get_attrs(d, feature, attr_id):
        '''
        From gff dataframe extract list of features by attribute ID
        Attribute string example for gene feature:
        ID=AGAP001235;biotype=protein_coding
        '''
        out = []
        try:
            for attr in d[d.feature == feature]['attributes']:
                for a in attr.split(';'):
                    aa = a.split('=')
                    if aa[0] == attr_id:
                        out.append(aa[1])
            if len(out) > 0:
                return ';'.join(out)
        except: # no annotations
            pass
            
        return 'None'
    
    attr_dict = dict()
    
    # intersect
    a_bed = pybedtools.BedTool(bed_str, from_string=True) 
    
    ag_gff = genes.intersect(a_bed)
    ar_gff = repeats.intersect(a_bed)
    # convert annotations to dataframe
    ampl_annot = pd.concat([bt_to_df(ag_gff), bt_to_df(ar_gff)])
    # convert gff coordinate to BED coordinate
    ampl_annot['start'] = ampl_annot.start - 1
    # generate gene and exon beds
    gene_bed = ampl_annot.loc[ampl_annot.feature == 'gene', 
                              ['seqname','start','end']].to_string(header=False, index=False)
    # intergenic, avoid empty bed operation
    if gene_bed.startswith('Empty'):
        attr_dict['intron'] = 'No'
        attr_dict['intergenic'] = 'Yes'
    else:
        gene_bed = pybedtools.BedTool(gene_bed, from_string=True)
        exon_bed = ampl_annot.loc[ampl_annot.feature == 'exon', 
                                  ['seqname','start','end']].to_string(header=False, index=False)
        # intron, avoid empty bed operation
        if exon_bed.startswith('Empty'):
            attr_dict['intron'] = 'Yes'
            attr_dict['intergenic'] = 'No'
        else:
            exon_bed = pybedtools.BedTool(exon_bed, from_string=True)
            # generate intergenic bed based on gene bed subtraction from amplicon bed
            intergenic = bt_to_df(a_bed.subtract(gene_bed))
            # generate intronic bed based on exon bed subtraction from amplicon bed
            intron = bt_to_df(a_bed.subtract(exon_bed))
            # all non-exonic sequences are intergenic - exonic terminus only
            if (intron.shape[0] > 0) and intron.equals(intergenic):
                attr_dict['intron'] = 'No'
                attr_dict['intergenic'] = 'Yes'
            else:
                attr_dict['intron'] = ('Yes' if intron.shape[0] > 0 else 'No')
                attr_dict['intergenic'] = ('Yes' if intergenic.shape[0] > 0 else 'No')
    attr_dict.update({
    'gene': get_attrs(ampl_annot, 'gene', 'ID'),
    'mRNA': get_attrs(ampl_annot, 'mRNA', 'ID'),
    'exon': get_attrs(ampl_annot, 'exon', 'ID'),
    'repeat': get_attrs(ampl_annot, 'repeat', 'Name'),
    })
    # predict utr by name
    attr_dict['utr'] = ('Yes' if ('utr' in str(ampl_annot['feature'])) else 'No')
    
    return attr_dict

annotate_interval(amplicon_beds[1], GENES_BED, REPEATS_BED)

{'intron': 'No',
 'intergenic': 'No',
 'gene': 'AGAP005134',
 'mRNA': 'AGAP005134-RA',
 'exon': 'E016100A',
 'repeat': 'None',
 'utr': 'No'}

In [17]:
ann_dict = dict()
for (i, bed) in enumerate(amplicon_beds):
    sys.stdout.write('\r{}'.format(i))
    ann_dict[i] = annotate_interval(bed, GENES_BED, REPEATS_BED)
# add annotation columns
print('\nDone!')

61
Done!


In [18]:
ampl_data = pd.concat([ampl_data, pd.DataFrame(ann_dict).T], axis=1)
ampl_data['repeat'].value_counts()

None    62
Name: repeat, dtype: int64

In [19]:
# genes
ampl_data['gene'].value_counts()

None          22
AGAP003271     3
AGAP005134     2
AGAP004405     1
AGAP000562     1
AGAP001497     1
AGAP002748     1
AGAP011422     1
AGAP002350     1
AGAP009860     1
AGAP006176     1
AGAP004000     1
AGAP008919     1
AGAP004648     1
AGAP006107     1
AGAP001806     1
AGAP010335     1
AGAP007831     1
AGAP007643     1
AGAP011194     1
AGAP003997     1
AGAP007008     1
AGAP008028     1
AGAP008077     1
AGAP012014     1
AGAP010147     1
AGAP007340     1
AGAP000054     1
AGAP027996     1
AGAP000061     1
AGAP002150     1
AGAP005681     1
AGAP012048     1
AGAP009126     1
AGAP000974     1
AGAP011116     1
AGAP000962     1
AGAP001435     1
Name: gene, dtype: int64

In [20]:
# intron
ampl_data['intron'].value_counts()

No     40
Yes    22
Name: intron, dtype: int64

In [21]:
# intergenic
ampl_data['intergenic'].value_counts()

No     39
Yes    23
Name: intergenic, dtype: int64

In [22]:
# exon
ampl_data[(ampl_data.intron == 'No') & (ampl_data.intergenic == 'No')].shape

(17, 29)

In [23]:
# both intron an intergenic spanned
ampl_data[(ampl_data.intron == 'Yes') & (ampl_data.intergenic == 'Yes')]

Unnamed: 0,ID,SNP,F,R,chr,start,end,F_deg,R_deg,identical_seq,...,insert_snvs,insert_indels,prop_id_lineages,exon,gene,intergenic,intron,mRNA,repeat,utr


## BLAST vs diptera

In [24]:
# extract agam sequences for markers removing alignment gaps
# headers are indices of amplicons
agam_fa = 'data/temp_ampl.fasta'

with open('data/temp_ampl.fasta', 'w') as o:
    i = 0
    for a in ampl_alns:
        o.write(">{}\n".format(i))
        o.write(str(a[0].seq).replace('-',''))
        o.write('\n')
        i += 1
    
!head {agam_fa}

>0
TGTGTACGGTCTGAAGAACATCCAGGCCGATGAGATGGTGGAGTTCTCCTCCGGACTTAAGGTAAAATAAAGCTGGATTCAATTCCCCCCGTCCCTCCCGCCCCTGCTATGTGCTAGCGCAGCGCTATTAACCGTCGCACGCTCGATTACGCTTACTTCCAGGGCATGGCCCTTAACTTGGAGCCGGATAA
>1
GAGCGTGCGGCCAAGATGAGCCCGACGCTCGGTGGCGGTTCGCTCACTGCCCTGCCGGTCATCGAAACCCAGGCCGGTGATGTGTCCGCTTACATTCCAACCAACGTCATCTCGATCACGGACGGACAGATCTTCTTGGAAACTGAGCTGTTCTACAAGGGTATCCGACCGGCCATTAACGTCGGTCTGT
>2
CAGTCAAATTTCCAGACAATCTAATATTTATTGCTCGCTGGCCGGATAAGAAACTCGTTGCGCATTTGATTTCCATTTCTGTCGTCCTGCCGCTGCTCTGGCCCTTTTGCTCGGCCCTCCAGCTAGCCGAGCAGGGCCAAAGATATTCATCTCCGTCTGCATCTCCGGAATTGTTTTCCTTCAAATGCACTTCCG
>3
GATATAAATTGTCGATCACACAAACTATTCTTTGCATTCTTTGGTAACTTATATGTTTGTAGAATATATATATTTACACCTATATACCTAAAACGTAGACAGTGGAAGATTGAAGAACACTTCGCCTCCTGCAGGCGCCGTCCTCTTGGTATCTATAAAAGGAGTCTTGAGATTGTACTACGATAAATGCA
>4
ATGCTCGTCATGATGATGATCTTCCAGAACGAGTTGAGGATGCCGATCCAGACGGCGCCGAAGTGCAGCAGCGGATTGTCGATCGTCTCCCCGGACACCAGCTCGTAGATGCGCAACCCAATGTACGGTGCCCACGAGATCCAGAAGGAAAACACCAATGCGAACGACATGCAGTGACTGGGGTTCGCTAGGTTCTCGGCGAGCG

In [25]:
blast_file = 'data/temp_blast.xml'
cline = NcbiblastnCommandline(cmd='blastn', out=blast_file, outfmt=5, query=agam_fa, 
                              db=COMB_DB, evalue=0.001)
print(cline)
# execute
cline()

blastn -out data/temp_blast.xml -outfmt 5 -query data/temp_ampl.fasta -db /Users/am60/malaria/data/genome/AgAaCpDm -evalue 0.001


('', '')

In [26]:
# read blast output
blast_records = list(NCBIXML.parse(open(blast_file)))
# http://biopython.org/DIST/docs/api/Bio.Blast.Record.HSP-class.html
# str not working because self.bits is not available for some reason
# test_hsp = blast_records[1].alignments[0].hsps[0] 
# print(test_hsp)
# print(test_hsp.identities, test_hsp.align_length)
blast_records[1].alignments[0].hsps[0]

<Bio.Blast.Record.HSP at 0x11e435f60>

In [27]:
# convert blast output
bd = dict()
species = ('AgamP3', 'CpipJ2', 'AaegL5', 'BDGP6')
for record in blast_records:
    q = record.query
    bd[q] = defaultdict(list)
    for aln in record.alignments:
        #extract species from hit definition
        (s,c) = [l.split(':')[1:3] for l in aln.hit_def.split(' ') if len(l.split(':')) > 2][0]
        if s not in species:
            raise ValueError('Unknown genome ' + s)
#         if s not in bd[q].keys():
#             bd[q][s] = defaultdict(list)
        for hsp in aln.hsps:
            if (hsp.num_alignments is not None):
                raise ValueError('Multiple alignmed fragments per HSP:\n' + str(hsp))
            #bd[q][s + '_length'].append(hsp.align_length)
            bd[q][s + '_identity'].append(hsp.identities / hsp.align_length)
            bd[q][s + '_q_span'].append('{}-{}'.format(hsp.query_start, hsp.query_end))
            bd[q][s + '_s_span'].append('{}:{}-{}'.format(c, hsp.sbjct_start, hsp.sbjct_end))
            #bd[q][s + '_expect'].append(hsp.expect)
            i+=1
        #bd[q][s + '_hits'] = len(bd[q][s + '_length'])
    for (k,v) in bd[q].items():
        if type(bd[q][k]) is list:
            bd[q][k] = ';'.join([str(s) for s in v])
            
bd['0']

defaultdict(list,
            {'AgamP3_identity': '1.0',
             'AgamP3_q_span': '1-191',
             'AgamP3_s_span': '2L:10356418-10356608',
             'AaegL5_identity': '0.9696969696969697',
             'AaegL5_q_span': '1-66',
             'AaegL5_s_span': '2:204342881-204342946',
             'CpipJ2_identity': '0.9682539682539683',
             'CpipJ2_q_span': '1-63',
             'CpipJ2_s_span': 'supercont3.30:1160215-1160153',
             'BDGP6_identity': '0.9516129032258065',
             'BDGP6_q_span': '1-62',
             'BDGP6_s_span': '2R:22800075-22800136'})

In [28]:
bd =  {int(k): v for k, v in bd.items()}
bd[1]

defaultdict(list,
            {'AgamP3_identity': '1.0',
             'AgamP3_q_span': '1-190',
             'AgamP3_s_span': '2L:10357459-10357648',
             'CpipJ2_identity': '0.9263157894736842',
             'CpipJ2_q_span': '1-190',
             'CpipJ2_s_span': 'supercont3.30:1159280-1159091',
             'AaegL5_identity': '0.8947368421052632',
             'AaegL5_q_span': '1-190',
             'AaegL5_s_span': '2:204349481-204349670'})

In [29]:
comb_meta = pd.concat([ampl_data, pd.DataFrame(bd).T.fillna('')], axis=1)
comb_meta

Unnamed: 0,ID,SNP,F,R,chr,start,end,F_deg,R_deg,identical_seq,...,AaegL5_s_span,AgamP3_identity,AgamP3_q_span,AgamP3_s_span,BDGP6_identity,BDGP6_q_span,BDGP6_s_span,CpipJ2_identity,CpipJ2_q_span,CpipJ2_s_span
0,31_var,high freq,TGTSTACGGTCTGAAGAACATc,TTATCCGGCTCCAAGTTAAGG,2L,10356417,10356608,TGT[G/C]TACGGTCTGAAGAACATc,TT[A/G]TCCGGCTCCAAGTTAAGG,"[{AsteS1, AsteI2}]",...,2:204342881-204342946,1.0,1-191,2L:10356418-10356608,0.9516129032258065,1-62,2R:22800075-22800136,0.9682539682539683,1-63,supercont3.30:1160215-1160153
1,33_other,low freq,GAGCGtGCGGCcAAGATG,ACAgACCGACGTTAATGGC,2L,10357458,10357648,GAGCGtGCGGCcAAGATG,ACAgACCGACGTTAATGGC,"[{AgamS1, AmerM1, AmelC1, AgamM1, AquaS1, Aara...",...,2:204349481-204349670,1.0,1-190,2L:10357459-10357648,,,,0.9263157894736842,1-190,supercont3.30:1159280-1159091
2,44_other,low freq,CAGTCAAATTTCCAGACAATCT,CGGAAGTGCATTTGAAGG-AAaA,2L,12084495,12084690,CAGTCAAATTTCCAGACAATCT,CGGAAGTGCATTTGAAGG-AAaA,"[{AgamS1, AgamM1, AgamP3}]",...,,1.0,1-195,2L:12084496-12084690,,,,,,
3,47_other,low freq,GaTATAAATTGTCGATCACACAAACT,TGCATTTATCGTAGTACAATCTCA,2L,13862864,13863055,GaTATAAATTGTCGATCACACAAACT,TGCATTTATCGTAGTACAATCTCA,"[{AgamS1, AmerM1, AmelC1, AgamM1, AquaS1, Aara...",...,,1.0,1-191,2L:13862865-13863055,,,,,,
4,58_cons,high freq,ATGcTBGTCATgATGATGATCT,CCGATCCACGATAAGGAGTAC,2L,18693481,18693714,ATGcT[C/G/T]GTCATgATGATGATCT,CCGATCCACGATAAGGAGTAC,"[{AgamS1, AmerM1, AgamM1, AquaS1, AaraD1, Agam...",...,2:74623848-74623616,1.0,1-233,2L:18693482-18693714,,,,0.8340425531914893,1-233,supercont3.137:804142-804374
5,59_other,low freq,GCTGGCGCATAATTATCaCAAA,tTTCCACTTCATCGCTCGC,2L,19065171,19065363,GCTGGCGCATAATTATCaCAAA,tTTCCACTTCATCGCTCGC,"[{AsteS1, AsteI2}]",...,,1.0,1-192,2L:19065172-19065363,,,,,,
6,65_other,,GCAAAATTTCCGTCCCATTA,TGTAATTAGCTGTGTCTTGTG,2L,20840388,20840620,GCAAAATTTCCGTCCCATTA,TGTAATTAGCTGTGTCTTGTG,[],...,,1.0,1-232,2L:20840389-20840620,,,,,,
7,77_var,high freq,GTcTCgGAGCACATYGTG,TCGTACTTCATTATTCTTTGGACTG,2L,26760700,26760890,GTcTCgGAGCACAT[C/T]GTG,TCGTACTTCATTATTCTTTGGACTG,"[{AgamM1, AgamP3}]",...,2:205452944-205453026,1.0,1-190,2L:26760701-26760890,,,,,,
8,83_var,high freq,AGTGRCTCCAGACGGTgTT,CCAAGGATTTGCTACTACCAcT,2L,27674646,27674836,AGTG[A/G]CTCCAGACGGTgTT,CC[A/G/T]AGGATTTGCTACTACCAcT,"[{AsteS1, AsteI2}]",...,,1.0,1-190,2L:27674647-27674836,,,,,,
9,112_cons,high freq,GTATGTAGCGDGGTCCATTG,GTGCTATCGTTAAGAGGTCGTA,2L,40854755,40854948,G[T/A]ATGTAGCG[G/A/T]GGTCCATTG,GTGCTATCGTTAAGAGGTCGTA,"[{AquaS1, AgamS1, AgamP3}, {AgamM1, AaraD1}, {...",...,,1.0,1-193,2L:40854756-40854948,,,,,,


## Check individual BLAST results

In [30]:
def display_primer_matches(idx):
    """
    For marker listed undex idx in annotation dataframe, 
    get primer sequence identities in matching outgroup (not AgamP4) genomes.
    """
    fp = comb_meta.iloc[idx]['F']
    rp = comb_meta.iloc[idx]['R']
    al = comb_meta.iloc[idx]['agam_ampl_length']
    print(blast_records[idx].query)
    for aln in blast_records[idx].alignments:
        if 'AgamP3' not in aln.title:
            print(aln.title.split(':')[2])
            a = aln.hsps[0]
            s = a.query_start - 1
            e = a.query_end
            delta = al - a.align_length
            print(fp + '.'*(3 + delta) + str(Seq(rp).reverse_complement()))
            print('.' * s + a.query[:len(fp)] + '...' + a.query[-len(rp):])
            print('.' * s + a.match[:len(fp)] + '...' + a.match[-len(rp):])
            print('.' * s + a.sbjct[:len(fp)] + '...' + a.sbjct[-len(rp):])
display_primer_matches(22)            

22
AaegL5
TAAAATGATTGCACACCTGcTCGC......CACAAGTCCgACgCGATT
TAAAATGATTGCACACCTGCTCGC...GGTCACAAGTCCGACGCG
||||||||||||| || |||||||... ||||| || ||||||||
TAAAATGATTGCAGACTTGCTCGC...TGTCACCAGGCCGACGCG
CpipJ2
TAAAATGATTGCACACCTGcTCGC..................CACAAGTCCgACgCGATT
TAAAATGATTGCACACCTGCTCGC...TCGCTCGATCGAGGTCAC
||||||||||||| ||||||||||... |||||||| || |||||
TAAAATGATTGCAGACCTGCTCGC...ACGCTCGATTGACGTCAC


In [31]:
display_primer_matches(50)

50
BDGP6
TTCCTCGCACACTCATGTAAG..................................................................................................................................cGTGCAGTTCTACATCGAGA
................................................................................................................................AGGGCCTGGACGCCAACGGGA...CGTGCAGTTCTACATCGAGA
................................................................................................................................|||| ||||| |||||||| |...||||||||||||||||||||
................................................................................................................................AGGGGCTGGATGCCAACGG-A...CGTGCAGTTCTACATCGAGA


In [32]:
comb_meta.columns

Index(['ID', 'SNP', 'F', 'R', 'chr', 'start', 'end', 'F_deg', 'R_deg',
       'identical_seq', 'idenified_lineages', 'aligned_spp', 'unaligned_spp',
       'aligned_ampl_length', 'agam_ampl_length', 'aligned_insert_length',
       'agam_insert_length', 'total_snvs', 'total_indels', 'insert_snvs',
       'insert_indels', 'prop_id_lineages', 'exon', 'gene', 'intergenic',
       'intron', 'mRNA', 'repeat', 'utr', 'AaegL5_identity', 'AaegL5_q_span',
       'AaegL5_s_span', 'AgamP3_identity', 'AgamP3_q_span', 'AgamP3_s_span',
       'BDGP6_identity', 'BDGP6_q_span', 'BDGP6_s_span', 'CpipJ2_identity',
       'CpipJ2_q_span', 'CpipJ2_s_span'],
      dtype='object')

## Write and clean up 

In [33]:
comb_meta.to_csv(ANN_FILE)

In [34]:
!rm {blast_file} {agam_fa}

## TODO
- annotation code clean up