Create degenerate primer sequences from alignments and internal primer coordinates

In [43]:
import re
import pandas as pd
from Bio import AlignIO
from Bio.Seq import Seq

In [32]:
AMPL_FILE = "data/20180706_comb_ampl.maf"

In [33]:
alns = list(AlignIO.parse(AMPL_FILE, "maf"))

In [102]:
def gapped_coord(seq, coord):
    '''
    Coordinate of i-th letter in sequence within gapped sequence representation
    '''
    i = 0 # nucleotides
    j = 0 # coordinates
    while i < coord:
        if str(seq[j]) != '-':
            i += 1
        j += 1
    return j
gapped_coord('-A-TA-C', 4)

7

In [117]:
def get_primers(alignment, fp, rp, min_alts=2):
    '''
    Locate primer sequences in first sequence in the alignment, 
    return primers with ambiguities coded as [ref/alt/...].
    
    min_alts controls for minimum number of genomes with alt alleles to be treated as ambiguous.
    Positions with lower number of alts are converted to lowercase.
    '''
    
    
    seq = ['', '', 0, 0]
    # search in 0-th sequence without gaps
    source = str(alignment[0].seq).replace('-','')
    
    # forward
    pmatch = re.search(fp, source, re.IGNORECASE)
    seq[2] = str(pmatch.start())
    for i in range(gapped_coord(alignment[0], pmatch.start()), gapped_coord(alignment[0], pmatch.end())):
        col = alignment[:, i]
        ref = col[0]
        if len(set(col)) == 1:
            seq[0] += ref
        else:
            alt = set(col)
            alt.remove(ref)
            seq[0] += '[{}/{}]'.format(ref, '/'.join(alt))
    
    # reverse
    rc_rp = Seq(rp).reverse_complement()
    pmatch = re.search(str(rc_rp), source, re.IGNORECASE)
    seq[3] = str(pmatch.end()-1)
    for i in range(gapped_coord(alignment[0], pmatch.end()) - 1,
                   gapped_coord(alignment[0], pmatch.start()) - 1,
                   -1):
        col = [str(Seq(nt).reverse_complement()) for nt in alignment[:, i]]
        ref = col[0]
        alt = set(col)
        alt.remove(ref)
        if len(alt) == 0: # no alts
            seq[1] += ref
        elif len(alt) == 1 and col.count(''.join(alt)) < min_alts: # single occurence of alt in alignment
            seq[1] += ref.lower()
        else:
            #print(''.join(col))
            seq[1] += '[{}/{}]'.format(ref, '/'.join(alt))
        
    return seq

get_primers(alns[508], 'ccttcatgaacagacggttgt', 'gagcacatcattgccggt')

['CCTTCAT[G/A]AACAGACGGTTGT', 'GAGCACAT[C/T]AT[T/C]GCCGGT', '20', '209']

In [105]:
all_primers = '''531_cons	acattagtgctttattacgtggt	catttctttggcactttctgat
435_var	ttttccacgattacacgacctg	gatctgtggtgagtacaatcga
1_cons	gccttctttgtctggagtctaa	aatctgaagccgatcttgaaga
36_cons	caaggccatgaagcaggt	agtagatgacggcgacct
473_var	tgctatggtacctaaacctacg	cgatgggtcaaacggaatcg
193_var	gccatgacgatatctacaatgc	gcgaacgctgattctaattttg
508_cons	ccttcatgaacagacggttgt	gagcacatcattgccggt
126_var	ttcaggagctcaccaaatcg	tccttcacctgttgtactgtat
200_cons	tactttgccgtccaaccg	agtaaaagtcacactattccct
495_var	ttcctcgcacactcatgtaag	tctcgatgtagaactgcacg
416_var	aactggcactagtcatgtacat	caattaattccgttagcccgtg
577_X	acacttgtactcggagatgc	acaactccaacatctccaagat
120_cons	gatgttcatgatgaagcgcac	gaaaacctcgagctggagatt
283_var	cagaactatattaaacattaatcgc	aaagtattcaacgacctcgacc
205_cons	agacacaaggccttgaagaag	acagcttcttcttgtcatcgta
579_X	cacctgacactcgatggg	catgcagccggctacttg
575_X	agactttcaggccatgatacc	tcagtgaacgttgacttggtaa
576_X	aggagatgcaagttactatcgc	cactatcgtagcccaggttc
582_X	gaggcgcaggacacgatc	ctcctggatcttctgcatctc
587_X	tttcagcagaaacatccacatg	gtttcgatgtacttggtggttt
474_other	tgaattatgattaatggtgtgcc	caagtgggtgtgacattcca
298_other	tagaattacaatcaagtggaagacg	ttatacagtggatggcagtcg
433_other	aatgcttccattaactgccatt	ggttaagttttaatgtgcgtaaca
280_other	ttaaacgaaagaagaagaaatc	acggaggtcgatgggtag
248_other	tactggatgatgatcgtgttct	tcttctccaagggtttctacag
471_other	ttccatcttccggatttat	ttgcgcaccatctggctg
567_other	tcggatgcaaatactgttatga	catacaccatctatccggctag
439_other	gtgacagataaattcgatcgc	tttatcaccgaaagaccgca
356_other	tctatcaccatgcacttcttgt	aatgtagataggctaaaaccgc
140_other	gtgaattagattttaatgaagtaa	ttttagatctgtgttgctctcg
297_other	ccatgatgtgctaatctgctta	ttgatccctagagaaaacggtc
299_other	tgttaaggtatttagaattacaatca	ttatacagtggatggcagtcg
269_other	aatgattcggatgatttgcttc	ccgtcattatttcccaatcaaa
65_other	gcaaaatttccgtcccatta	gtttgattaccgacccttcaag
101_other	atcgatgccaaggtgtatcag	aacgtgaagaagagatcgagtc
245_other	cgcaattgtcacaaatctgtat	gcagtttacggcttttctcc'''

In [123]:
for marker in all_primers.split('\n'):
    mdata = marker.split('\t')
    mid = int(mdata[0].split('_')[0])
    print('\t'.join(get_primers(alns[mid], mdata[1], mdata[2])[:2]))

ACATTAGTGCTTTATTACGTGGT	CATTTCTTTGGCACTTTCTGAT
TTTTCCACGATTACACGACCTG	GATCTGTGGT[G/A]AGT[A/T/G][C/-/T/A/G][A/-/T/C][A/-][T/-/C][C/-/T][G/-/A][A/-]
GCCTTCTTTGTCTGGAGTCTAA	A[A/G]TC[T/C/A]GA[A/G]gC[C/T/G]GA[T/C]CT[T/A]GAaGA
CAAGGCCATGAAGCAGGT	AGTAGATgACgGC[G/A]ACcT
TGCTATGGTACCTAAACCTACG	CGATGGGTCAAACGGAATCG
GCCATGACGATATCTACAATGC	GCGAACGCTGATTCTAATTTTG
CCTTCAT[G/A]AACAGACGGTTGT	GAGCACAT[C/T]AT[T/C]GCCGGT
TTCAGGAGCTCACCAAATCG	TCCTTCACCTGTTGTACTGTAT
TACTTTGCCGTCCAACCG	AGTAAAAGTCAcACtAtT[C/T][C/G]Ct
TTCCTCGCACACTCATGTAAG	TCTCGATGTAGAACTGCACg
AACT[G/T]GCACTAGTCATGTACAT	CAATTAATTCCGTTAGCCCGTG
ACACTTGTACTC[G/C]GAGATGC	ACAA[C/T]TC[C/G]AACATcTC[C/G]AA[G/A]AT
GATGTTCAT[G/A]AT[G/A]AAGCGCAC	GA[A/C/G]AA[C/T]CT[C/T/G]GAGC[T/A]GGAGAT[T/C]
CAGAACTATATTAAACATTAATCG[C/T]	AAAGTATTCAaCGaC[C/-]TC[G/A][A/-/C]C[C/A]
AGACACAAGGCCTTGAAGAAG	ACAGCTTCTTCTTGTCaTCGTA
[C/T]ACCTGACA[C/T]TCGAT[G/C]GG	[C/-][A/-/C][T/-/C][G/-/C][C/-/T]AGCCGGCTA[C/T][T/C]TG
[A/G]GA[C/T]TTTCAGGCCATGAT[A/C]CC	TC[A/T/C/G]GTGAA[C/G]GTTGA[C/T]