Create degenerate primer sequences from alignments and internal primer coordinates

In [32]:
import re
import sys
import pandas as pd
import numpy as np
from Bio import AlignIO
from Bio.Seq import Seq

In [2]:
# in
AMPL_FILE = "data/20180706_comb_ampl.maf"
PRIMER_FILE = "data/20180914_Simpledesigninfo_updated.xlsx"
# out
OUT_FILE = "data/20180918_Simpledesigninfo_updated.xlsx"

In [6]:
alns = list(AlignIO.parse(AMPL_FILE, "maf"))
primers = pd.read_excel(PRIMER_FILE, sheet_name=None)
primers['Multiplex_core']

Unnamed: 0,ID,TemplateID,ForwardPrimer(Fp),ReversePrimer(Rp),FpTm(Celsius degree),RpTm(Celsius degree),FpGC(%),RpGC(%),FpPos,RpPos,Penalty,AmpSize(bp),SNPs in primer?
0,1,531_cons,acattagtgctttattacgtggt,catttctttggcactttctgat,56.87,56.1,34.78,36.36,18,207,1.9,190,
1,2,432_cons,acctgcattcgtatgtgttcta,ttcgaggtgggagattttcttc,58.12,58.65,40.91,45.46,5,217,0.65,213,
2,3,473_var,tgctatggtacctaaacctacg,cgatgggtcaaacggaatcg,57.66,59.91,45.46,55.0,46,269,3.91,224,
3,4,193_var,gccatgacgatatctacaatgc,gcgaacgctgattctaattttg,57.98,57.58,45.46,40.91,16,249,0.42,234,
4,2,126_var,ttcaggagctcaccaaatcg,tccttcacctgttgtactgtat,58.48,57.08,50.0,40.91,123,314,0.92,192,low freq
5,3,416_var,aactggcactagtcatgtacat,caattaattccgttagcccgtg,57.77,58.63,40.91,45.46,19,212,0.63,194,low freq
6,4,495_var,ttcctcgcacactcatgtaag,tctcgatgtagaactgcacg,58.62,58.02,47.62,50.0,68,261,2.02,194,low freq
7,5,435_var,agttctttggagagttcggaat,aagaaaagaagtgagaaagcga,57.97,56.82,40.91,36.36,48,242,1.18,195,low freq
8,6,283_var,cacacaacttttatgtttaatca,aaaataatggccactattccc,52.53,54.45,26.09,38.1,53,249,4.55,197,low freq
9,10,205_cons,agacacaaggccttgaagaag,acagcttcttcttgtcatcgta,58.73,58.07,47.62,40.91,41,275,0.07,235,low freq


In [7]:
def gapped_coord(seq, coord):
    '''
    Coordinate of i-th letter in sequence within gapped sequence representation
    '''
    i = 0 # nucleotides
    j = 0 # coordinates
    while i < coord:
        if str(seq[j]) != '-':
            i += 1
        j += 1
    return j
gapped_coord('-A-TA-C', 4)

7

In [65]:
def get_primer(alignment, alnid, primer, reverse=False, min_alts=2):
    '''
    Locate primer sequence in first sequence in the alignment, 
    return primer with ambiguities coded as [ref/alt/...].
    
    reverse - primer located on reverse strand
    
    min_alts - minimum number of genomes with alt alleles to be treated as ambiguous.
    Positions with lower number of alts are converted to lowercase.
    '''
#     print(alnid)
#     print(primer)
#     print(alignment)
    
    seq = ''
    alnid = int(alnid)
    # search in 0-th sequence without gaps
    source = str(alignment[alnid][0].seq).replace('-','')
    
    if reverse:
        primer = str(Seq(primer).reverse_complement())
    pmatch = re.search(primer, source, re.IGNORECASE)
    # reverse strand primer
    if reverse:
        for i in range(gapped_coord(alignment[alnid][0], pmatch.end()) - 1,
                       gapped_coord(alignment[alnid][0], pmatch.start()) - 1,
                       -1):
            col = [str(Seq(nt).reverse_complement()) for nt in alignment[alnid][:, i]]
            ref = col[0]
            alt = set(col)
            alt.remove(ref)
            if len(alt) == 0: # no alts
                seq += ref
            elif len(alt) == 1 and col.count(''.join(alt)) < min_alts: # single occurence of alt in alignment
                seq += ref.lower()
            else:
                #print(''.join(col))
                seq += '[{}/{}]'.format(ref, '/'.join(alt))
    # forward strand primer
    else:
        for i in range(gapped_coord(alignment[alnid][0], pmatch.start()), gapped_coord(alignment[alnid][0], pmatch.end())):
            col = alignment[alnid][:, i]
            ref = col[0]
            alt = set(col)
            alt.remove(ref)
            if len(set(col)) == 1:
                seq += ref
            elif len(alt) == 1 and col.count(''.join(alt)) < min_alts: # single occurence of alt in alignment
                seq += ref.lower()
            else:
                alt = set(col)
                alt.remove(ref)
                seq += '[{}/{}]'.format(ref, '/'.join(alt))
    
    # reverse
    
        
    return seq

print(get_primer(alns, 508, 'ccttcatgaacagacggttgt'))
print(get_primer(alns, 508, 'gagcacatcattgccggt', reverse=True))

CCTTCATgAACAGACGGTTGT
GAGCACAT[C/T]AT[T/C]GCCGGT


In [67]:
primers['Multiplex_core']['AlnID'] = primers['Multiplex_core']['TemplateID'] \
                                        .str.split('_').apply(lambda x: int(x[0]))
# vectorize primer search, excluded - alignment list is passed entirely
primers['Multiplex_core']['fp_deg'] = np.vectorize(get_primer, excluded=[0]) \
                                     (alns, primers['Multiplex_core']['AlnID'],
                                      primers['Multiplex_core']['ForwardPrimer(Fp)'])
primers['Multiplex_core']['rp_deg'] = np.vectorize(get_primer, excluded=[0]) \
                                     (alns, primers['Multiplex_core']['AlnID'],
                                      primers['Multiplex_core']['ReversePrimer(Rp)'], reverse=True)
primers['Multiplex_core']

Unnamed: 0,ID,TemplateID,ForwardPrimer(Fp),ReversePrimer(Rp),FpTm(Celsius degree),RpTm(Celsius degree),FpGC(%),RpGC(%),FpPos,RpPos,Penalty,AmpSize(bp),SNPs in primer?,AlnID,fp_deg,rp_deg
0,1,531_cons,acattagtgctttattacgtggt,catttctttggcactttctgat,56.87,56.1,34.78,36.36,18,207,1.9,190,,531,ACATTAGTGCTTTATTACGTGGT,CATTTCTTTGGCACTTTCTGAT
1,2,432_cons,acctgcattcgtatgtgttcta,ttcgaggtgggagattttcttc,58.12,58.65,40.91,45.46,5,217,0.65,213,,432,ACCTGCATTCGTATGTGTTCTA,TTCGAGGTGGGAGATTTTCTTC
2,3,473_var,tgctatggtacctaaacctacg,cgatgggtcaaacggaatcg,57.66,59.91,45.46,55.0,46,269,3.91,224,,473,TGCTATGGTACCTAAACCTACG,CGATGGGTCAAACGGAATCG
3,4,193_var,gccatgacgatatctacaatgc,gcgaacgctgattctaattttg,57.98,57.58,45.46,40.91,16,249,0.42,234,,193,GCCATGACGATATCTACAATGC,GCGAACGCTGATTCTAATTTTG
4,2,126_var,ttcaggagctcaccaaatcg,tccttcacctgttgtactgtat,58.48,57.08,50.0,40.91,123,314,0.92,192,low freq,126,TTCAGGAGCTCACCAAATCG,TCCTTCACCTGTTGTACTGTAT
5,3,416_var,aactggcactagtcatgtacat,caattaattccgttagcccgtg,57.77,58.63,40.91,45.46,19,212,0.63,194,low freq,416,AACTgGCACTAGTCATGTACAT,CAATTAATTCCGTTAGCCCGTG
6,4,495_var,ttcctcgcacactcatgtaag,tctcgatgtagaactgcacg,58.62,58.02,47.62,50.0,68,261,2.02,194,low freq,495,TTCCTCGCACACTCATGTAAG,TCTCGATGTAGAACTGCACg
7,5,435_var,agttctttggagagttcggaat,aagaaaagaagtgagaaagcga,57.97,56.82,40.91,36.36,48,242,1.18,195,low freq,435,AGTTCTTTGGAGAGTTCgGAAT,AAGAAAAGAAGTGAGAAAGCGA
8,6,283_var,cacacaacttttatgtttaatca,aaaataatggccactattccc,52.53,54.45,26.09,38.1,53,249,4.55,197,low freq,283,CAC-ACAACTTTTATGTTTAATCA,AAAaTAATGGCCACTATTCCC
9,10,205_cons,agacacaaggccttgaagaag,acagcttcttcttgtcatcgta,58.73,58.07,47.62,40.91,41,275,0.07,235,low freq,205,AGACACAAGGCCTTGAAGAAG,ACAGCTTCTTCTTGTCaTCGTA


In [69]:
primers['Additional markers to add']
primers['Additional markers to add']['AlnID'] = primers['Additional markers to add']['TemplateID'] \
                                        .str.split('_').apply(lambda x: int(x[0]))
# vectorize primer search, excluded - alignment list is passed entirely
primers['Additional markers to add']['fp_deg'] = np.vectorize(get_primer, excluded=[0]) \
                                     (alns, primers['Additional markers to add']['AlnID'],
                                      primers['Additional markers to add']['ForwardPrimer(Fp)'])
primers['Additional markers to add']['rp_deg'] = np.vectorize(get_primer, excluded=[0]) \
                                     (alns, primers['Additional markers to add']['AlnID'],
                                      primers['Additional markers to add']['ReversePrimer(Rp)'], reverse=True)
primers['Additional markers to add']

Unnamed: 0,ID,TemplateID,ForwardPrimer(Fp),ReversePrimer(Rp),FpTm(Celsius degree),RpTm(Celsius degree),FpGC(%),RpGC(%),FpPos,RpPos,Penalty,AmpSize(bp),SNPs in primer?,AlnID,fp_deg,rp_deg
0,2,77_var,gtctcggagcacatcgtg,tcgtacttcattattctttggactg,58.47,57.76,61.11,36.00,9,198,3.24,190,high freq,77,GTcTCgGAGCACAT[C/T]GTG,TCGTACTTCATTATTCTTTGGACTG
1,3,83_var,agtgactccagacggtgtt,ccaaggatttgctactaccact,58.76,58.37,52.63,45.46,11,200,0.37,190,high freq,83,AGTG[A/G]CTCCAGACGGTgTT,CC[A/T/G]AGGATTTGCTACTACCAcT
2,4,174_var,gtctcgaaaccgtacgtactaa,cactactactggcctcggata,58.25,58.73,45.46,52.38,39,228,1.73,190,high freq,174,GTCTCGAAACCGTA[C/A]GT[A/G]CTAA,CACTAcTACTGGCC[T/C]CGGATA
3,5,206_var,agacatatcaaccacaagcgta,acgaagctctgatatgtatcgt,58.12,57.48,40.91,40.91,14,204,0.52,191,high freq,206,AGACATATC[-/A]AA[C/A]-C[A/-]CAAGCGTA,a[C/T][G/T][A/T/G][A/G]GCTCTGATATGt[A/G]TCGT
4,6,31_var,tgtgtacggtctgaagaacatc,ttatccggctccaagttaagg,58.67,58.04,45.46,47.62,5,195,1.04,191,high freq,31,TGT[G/C]TACGGTCTGAAGAACATc,TT[A/G]TCCGGCTCCAAGTTAAGG
5,7,281_cons,ccttgtgaacactgttgaactt,agctgtttgaatcgaaaccttc,57.88,58.09,40.91,40.91,19,209,0.09,191,high freq,281,CCTTGTGAACACTGTTGAACTT,AgCTGTT[T/C]GAATCGAAACC[T/A]TC
6,9,220_var,ggtcgtaaccaaatgtccaaag,gagtgcactgaaaacctatgtg,58.42,58.17,45.46,45.46,14,205,0.17,192,high freq,220,gGT[C/T]GTAACCAA[A/G]TGTCCAAAG,GAGTGCACTGAAAACCTATGTG
7,11,418_cons,tgttacggttaccaaatatgct,ggattcttttcagaagacgctc,56.40,57.98,36.36,45.46,2,193,0.02,192,high freq,418,TGTTACG[G/A]TT[A/G]CCAAAtATgCT,GGATTCTTTTCAGAaGACG[C/T]TC
8,12,264_cons,cagatcctccgccttgatg,tccaagggtacgattgaaatca,58.27,58.03,57.90,40.91,23,214,0.03,192,high freq,264,CAGATC[C/T/G]TC[C/T/G]G[C/G]CTTGATg,TC[C/G]AAGGGTACgATTGAAATCA
9,13,112_cons,gtatgtagcggggtccattg,gtgctatcgttaagaggtcgta,58.70,58.04,55.00,45.46,9,201,0.03,193,high freq,112,G[T/A]ATGTAGCG[G/A/T]GGTCCATTG,GTGCTATCGTTAAGAGGTCGTA


In [75]:
writer = pd.ExcelWriter(OUT_FILE)
for (k,v) in primers.items():
    v.to_excel(writer, sheet_name=k)
writer.save()