In [1]:
import Bio
import pandas as pd
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction, MeltingTemp as mt
from collections import Counter
from seqfold import dg, fold
import random
import re
import time

In [2]:
# generate as many umis as possible
# define the length
umilen = 16

# function to create umis
def create_umi(length = umilen):
    return ''.join(random.choices('ACGT', k=length))

# for a sequence of 16 nucleotides, the average count for nucleotide must be 4
# on average, counts should be 4!
# we can set 1 as the highest deviation from the average that we accept for the umi to be "equilibrated"
threshold =  1

# function to check if nucleotides are in equilibrium
def count_nns(umi, threshold):
    counts = Counter(umi).values()
    mean_count = sum(counts) / len(counts)
    
    # calculate deviations
    devs = [abs(count - mean_count) for count in counts]
    # check that all counts are within the deviation threshold
    if all(dev <= threshold for dev in devs):
        return umi
        
random.seed(666)

In [None]:
# we have randomly chosen umis that are roughly equilibrated
# now we need to check that the umi doesn't have more than N of the same nucleotide together

In [3]:
# homopolymers no longer than 0 == no homopolymers
N = 0
# number of homopolymers threshold
P = 0

def check_homopolymers(umiseq, N, P):
    # create empty variables to store nucleotides
    current_nn = None
    conseq = 0 # initiate the count of nucleotides from zero
    homseq = 0 # count for homopolymers
    longest = 0 # keeps track of the longest homopolymer

    # iterate through the umi sequence
    for nn in umiseq:
        if nn == current_nn:
            conseq += 1 # add one to the count if subsequent nucleotides after current are the same
            homseq += 1
            if conseq >= longest:
                longest = conseq
        else:
            # replace the variable with the next nucleotide if it is different
            conseq = 0 # resets the nucleotide count
        current_nn = nn  # Change to a new nucleotide

    # check if the homopolymer count exceeds the N threshold
    if homseq > P: # has at least one homopolymer
        return False  # Homopolymer of length greater than N found
    # filter by longest homopolymer
    elif longest > N:
        return False
    else:
        return umiseq  # No homopolymer longer than N found

In [4]:
# now, we need to check that the sequence is not complementary or within the target sequence 

def check_seq_match(umi, target):
    rev_target = Seq(target)[::-1]  # reverse target sequence
    rev_comp_target = Seq(target).reverse_complement()  # reverse complement target

    # check the sequences
    if str(umi) == str(target):
        return False, 'UMI is the same as the target sequence'
    elif str(umi) == str(rev_target):
        return False, 'UMI is the reverse of the target sequence'
    elif str(umi) == str(rev_comp_target):
        return False, 'UMI is the reverse complement of the target sequence'

    return umi, 'UMI is unique and does not match the target sequence'

In [None]:
# but we also want to check for partial matches

In [5]:
def check_partial_matches(umi, target, min_match = 6):
    # Generate reverse and reverse complement of the target sequence
    rev_target = Seq(target)[::-1]  # Reverse of the target sequence
    rev_comp_target = Seq(target).reverse_complement()  # Reverse complement using BioPython
    
    # Function to check for partial matches in a given target sequence
    def find_partial_matches(umi, target, min_match):
        umi_len = len(str(umi))
        target_len = len(str(target))

        # function to iterate through nucleotides from zero to the target's length minus match length plus one
        # that is the last kmer of length min_match + 1
        for i in range(target_len - min_match + 1):
            # iterates through nucleotides from the first kmer to the length of the umi plus +
            for j in range(min_match, umi_len + 1):
                # checks if umi in target and vice versa
                if target[i:i+j] in umi or umi in target[i:i+j]:
                    return 'Partial match', target[i:i+j] # there's a match
        # this umi is returning the sequence
        return 'No match of length %s'%(min_match), 'comparing: %s'%(target[i:i+j])
    
    # check partial matches of umi on the target, reverse, and reverse complement
    # using the iterating function above
    for seq in [target, str(rev_target), str(rev_comp_target)]:
        umitemp, match_sequence = find_partial_matches(umi, seq, min_match)
        # if there is a match, the match_sequence string will contain text and the matching sequences
        # thus, it would be longer than the minimum match length allowed
        if len(match_sequence) > min_match:
            return False, f'Partial match found: {match_sequence} in sequence {seq}'
    return umi, 'No significant partial matches found'

In [6]:
# test the tree functions
test = 'AAACCCTTTGGGATCG'
test1 = 'TACGTACGTACGTACG'

target = 'AAACCCTTTGGGATCG'
print(check_homopolymers(test, N, P))
print(check_homopolymers(test1, N, P))

print(check_seq_match(test,target))
print(check_seq_match(test1,target))

print(check_partial_matches(test,target, 5))
print(check_partial_matches(test1,target, 5))

umipass, statement = check_partial_matches(test1,target, 5)
print(umipass)

False
TACGTACGTACGTACG
(False, 'UMI is the same as the target sequence')
('TACGTACGTACGTACG', 'UMI is unique and does not match the target sequence')
(False, 'Partial match found: comparing: CCAAA in sequence GCTAGGGTTTCCCAAA')
(False, 'Partial match found: comparing: GATCG in sequence AAACCCTTTGGGATCG')
False


In [7]:
target = 'CACACCTCGGTGTGAAGCAAATGATTGTTGCTATTAACAAGATGGACGACAAATCTGTCAACTGGGCACAATCTAGGTACGATGAAATAGTTAAGGAAGTATCCTCCTTTGTCAAGAAGATCGGCTACAACCCTGAGAAGATCCCGTTCGTCCCCATTTCTGGTTGGCACGGAGATAACATGCTCGAGAAGTCATCTAACCTCTCATGGTATAAAGGCCCCACATTACTCGAGGCCCTCGACTCTGTGTCAGAACCCAAGAGACCAACGGAAAAGCCCCTCCGAATTCCCCTTCAAGATGTTTACAAGATTGGAGGTATTGGAACTGTGCCTGTAGGCCGTGTTGAGACTGGGGTTCTCAAACCAGGAATGAACGTTACTTTCTCCCCTGCTGGTTTGACCACTGAAGTTAAGTCTGTTGAGATGCACCACGTCTCTCTCCCTGAGGCTGTCCCAGGTGATAACGTTGGTTTCAATGTCAAGAATCTGTCAGTTAAGGATATTCGTCGTGGTATGGTTGCTGGTGATGCCAAGAATGATCCCCCTCAAGAGACTGAAGATTTTAATGCCCAAGTTATTATTCTCAATCACCCTGGTCAGATCCATGCAGGATATGCCCCAGTGCTTGATTGTCACACTGCTCATATCGCCTGTAAGTTCAGCGAGATTCTCTCCAAAGTAGATCGTAGATCTGGTCAAGAGACTGAGGCTGCCCCTAAGAATATCAAGAACGGAGATGCCCGCCATAGTTAAACTCACTCCCTCCCAAGGCCCATGTGTGTGGAGTCTTTCTCTGATTACCCACCCCCTTGG'
testlist = ['CTGCTGCAGAATACAT', 'TACACTGCGCAGCTAT', 'CAGGAGTAACTACTGC', 'GTACGCCCAGCATATG', 'CGTATCTAGGTAACAT', 'TGATGGACTCACCAGC', 'GTCTGCGTGATTAACA', 'GAACTAGCTGCAGGCT', 'TATGCTAGACAGGTCA', 'GAGTACATCGCTCAGT']

newlist = []
for umi in testlist:
    check_seq_match(umi, target)
    umibool, statement = check_seq_match(umi, target)
    # print('step1', umibool, statement)
    if umibool != False: 
        # print('step2', umibool, statement)
        if umibool not in newlist and umibool != True:
            # print('umibool', umibool)
            newlist.append(umibool)
            
print('Finding UMIs with no full match:')
print('Remaining UMIs %s'%(len(newlist)))
print(newlist[0:10], '\n')


newlist1 = []
for umi in newlist:
    umibool, statement = check_partial_matches(umi, target, min_match = 6)
    # print('step1', umibool, statement)
    if umibool != False:
        # print('step2', umibool, statement)
        if umibool not in newlist1:
            # print('umibool', umibool)
            newlist1.append(umibool)
            
print('UMIs with no partial match')
print('Remaining UMIs %s'%(len(newlist1)))
print(newlist1[0:10], '\n')


Finding UMIs with no full match:
Remaining UMIs 10
['CTGCTGCAGAATACAT', 'TACACTGCGCAGCTAT', 'CAGGAGTAACTACTGC', 'GTACGCCCAGCATATG', 'CGTATCTAGGTAACAT', 'TGATGGACTCACCAGC', 'GTCTGCGTGATTAACA', 'GAACTAGCTGCAGGCT', 'TATGCTAGACAGGTCA', 'GAGTACATCGCTCAGT'] 

UMIs with no partial match
Remaining UMIs 2
['CTGCTGCAGAATACAT', 'TGATGGACTCACCAGC'] 



In [184]:
# implement the tree functions
# assume the following target barcode in Fungi
# TEF1α region Al33F (5′-GAYTTCATCAAGAACATGAT-′3) and Al33R (5′-GACGTTGAADCCRACRTTGTC-′3)
# from https://cdnsciencepub.com/doi/10.1139/gen-2018-0083
# create umis

# Trichia persimili, a myxomicetes (group found in deadwood, doesnt mean this spp is saproxylic)
# GenBank: FJ546692.1
target = 'CACACCTCGGTGTGAAGCAAATGATTGTTGCTATTAACAAGATGGACGACAAATCTGTCAACTGGGCACAATCTAGGTACGATGAAATAGTTAAGGAAGTATCCTCCTTTGTCAAGAAGATCGGCTACAACCCTGAGAAGATCCCGTTCGTCCCCATTTCTGGTTGGCACGGAGATAACATGCTCGAGAAGTCATCTAACCTCTCATGGTATAAAGGCCCCACATTACTCGAGGCCCTCGACTCTGTGTCAGAACCCAAGAGACCAACGGAAAAGCCCCTCCGAATTCCCCTTCAAGATGTTTACAAGATTGGAGGTATTGGAACTGTGCCTGTAGGCCGTGTTGAGACTGGGGTTCTCAAACCAGGAATGAACGTTACTTTCTCCCCTGCTGGTTTGACCACTGAAGTTAAGTCTGTTGAGATGCACCACGTCTCTCTCCCTGAGGCTGTCCCAGGTGATAACGTTGGTTTCAATGTCAAGAATCTGTCAGTTAAGGATATTCGTCGTGGTATGGTTGCTGGTGATGCCAAGAATGATCCCCCTCAAGAGACTGAAGATTTTAATGCCCAAGTTATTATTCTCAATCACCCTGGTCAGATCCATGCAGGATATGCCCCAGTGCTTGATTGTCACACTGCTCATATCGCCTGTAAGTTCAGCGAGATTCTCTCCAAAGTAGATCGTAGATCTGGTCAAGAGACTGAGGCTGCCCCTAAGAATATCAAGAACGGAGATGCCCGCCATAGTTAAACTCACTCCCTCCCAAGGCCCATGTGTGTGGAGTCTTTCTCTGATTACCCACCCCCTTGG'

umi_raw = []
# to generate 96 forward and 96 reverse umis
# and check for nucleotide composition balance
# 4**16 is as many combinations as we can get =  4294967296
# 100000 is the number of initial random tags, you can reduce or increase the number if it takes too long
# or if no enough umis pass the test
while len(umi_raw) <= 100000:
    umi = create_umi()
    umi_pass1 = count_nns(umi, threshold)
    
    # add the umi to the list if it is not already there
    if umi_pass1 not in umi_raw and umi_pass1 != None:
        umi_raw.append(umi_pass1)
print('Raw UMIs created: %s'%(len(umi_raw)))
print(umi_raw[0:10], '\n')

# check umis for homopolymers

N = 2 # allow the longest homopolymer to have two nucleotides
P = 2 # allow a max of one homopolymer

umi_uniq = []

for umi in umi_raw:
    if check_homopolymers(umi, N, P) is False:
        pass
    else:
        if umi not in umi_uniq:
            umi_uniq.append(umi)
print('Non-homopolymer UMIs with homopolymer number threshold %s and nucleotide count threshold %s'%(P,N))
print('Remaining UMIs %s'%(len(umi_uniq)))
print(umi_uniq[0:10], '\n')

# check for matches
umi_nofmatch = []

for umi in umi_uniq:
    check_seq_match(umi, target)
    umibool, statement = check_seq_match(umi, target)
    if umibool != False: 
        if umibool not in umi_nofmatch and umibool != True:
            umi_nofmatch.append(umibool)
print('UMIs with no full match')
print('Remaining UMIs %s'%(len(umi_nofmatch)))
print(umi_nofmatch[0:10], '\n')

# check for partial matches
umi_preselected = []

for umi in umi_nofmatch:
    umibool, statement = check_partial_matches(umi, target, min_match = 6)
    if umibool != False:
        if umibool not in umi_preselected:
            umi_preselected.append(umibool)
            
print('UMIs with no partial match')
print('Remaining UMIs %s'%(len(umi_preselected))) # number of umis that pass all tests
umi_preselected[0:10] # prints the first 10 in the list

Raw UMIs created: 1000001
['TTGATCGAGCCACTGG', 'GGTAAAACACGTCGTT', 'ATGAGTAGATCCCCGA', 'ACATAAGTTCAGGTCC', 'CTAAAGGGTCTTCGTG', 'TAATCCGCAAGGGAGT', 'TCGTCTAGCTCGAAAA', 'TCAACGGCTAGCGGTC', 'AATACTCCCTGAGGAG', 'TATCAGGCCGAAACTC'] 

Non-homopolymer UMIs with homopolymer number threshold 2 and nucleotide count threshold 2
Remaining UMIs 351031
['CGATATGAGCAATGCC', 'TATCATGATCTCAGGA', 'GCACTCATGTGAGCAG', 'AGACTCTGACTTGTAA', 'GGACGTGTACATCATG', 'ATAGCTGCTGTTCACA', 'ACGCGTCTGAAGTAGT', 'GAGGTGCAGATACTTC', 'TTAGTCTAGTCAAGCA', 'AGACCGGTACTAGATG'] 

UMIs with no full match
Remaining UMIs 351031
['CGATATGAGCAATGCC', 'TATCATGATCTCAGGA', 'GCACTCATGTGAGCAG', 'AGACTCTGACTTGTAA', 'GGACGTGTACATCATG', 'ATAGCTGCTGTTCACA', 'ACGCGTCTGAAGTAGT', 'GAGGTGCAGATACTTC', 'TTAGTCTAGTCAAGCA', 'AGACCGGTACTAGATG'] 

UMIs with no partial match
Remaining UMIs 142435


['TATCATGATCTCAGGA',
 'GCACTCATGTGAGCAG',
 'AGACTCTGACTTGTAA',
 'ATAGCTGCTGTTCACA',
 'TTAGTCTAGTCAAGCA',
 'GACACACCGTTAGTGT',
 'AACTCGTCATGTGGTA',
 'TCATGTACCTGACGAC',
 'AATGCATACTCGAGCG',
 'CAGTGTGTCAACACGG']

In [276]:
# all sequences pooled should start with the same nucleotide to prevent biases during sequencing and amplification
# see 10.1038/s41579-018-0116-y and 10.1111/mec.16460
# count how many umis you have per nucleotide (first nucleotide)

first_nn = [umi[0] for umi in umi_preselected]
print(len(first_nn)) # should be the same number of umis in the pre-selected list

print(Counter(first_nn))

# selecting the nucleotide with the highest counts maximizes the number of tagged primers
# that pass the thresholds

umis_selected = [umi for umi in umi_preselected if umi[0] == 'C']
print(len(umis_selected))

142435
Counter({'C': 36644, 'G': 36122, 'A': 35044, 'T': 34625})
36644


In [310]:
# finally, calculate GC content and annealing temperature
# for the remaining 142435 umis (we need 96*2)
# we can check the first 1000 or so umis
# we will select those with compatible temperatures and good CG content

padd = 'GGTAG'
primer_forward = 'GAYTTCATCAAGAACATGAT'
primer_reverse = 'GACGTTGAADCCRACRTTGTC'
gc_min = 47
gc_max = 53
# based on https://www.tandfonline.com/doi/full/10.1080/15572536.2006.11832842#d1e421 - FUNGI!
# no more than 5 degree difference in annealing temperatures between primer pairs
temp_min = 62
temp_max = 67

# how many tagged primers to generate
# e.g. if num_primers = 96, it will generate 96 forward and 96 reverse
num_primers = 96

# creates the table to store information
tagged_primers = pd.DataFrame(columns = ['sense','umi_seq','padd_seq','primer_seq','full_primer','gc_content','temp'])

# to make sure no same umi is used for two primers within one pair:
# i.e. one umi used in a forward primer cannot longer be used for a forward.
umi_track = []

# initiate counters to generate just the primers necessary
f_count = 0
for umi in umis_selected: # umi_preselected[0:20000] if you want to test things fast, you can select the first e.g. 20000 rows
    # padd should be between umi and primer to prevent the umi from acting as a primer
    # see 10.1038/s41579-018-0116-y and 10.1111/mec.16460
    if umi not in umi_track:
        fullseq = umi + padd + primer_forward
        # print(fullseq)
        gc_content = gc_fraction(fullseq) * 100
        # print(gc_content)
        temp = mt.Tm_NN(fullseq)
        # print(temp)
        sense = 'forward' # python is zero indexed

        # add thresholds if you wanna keep padd+umi+primer temperatures that overlap the
        # PCR protocol annealing temperatures already
        # and to keep the GC content close to 50% - second filter for that
        # greater than or equal AND lesser then or equal to
        if gc_min <= gc_content <= gc_max and temp_min <= temp <= temp_max:
            row = [sense, umi, padd, primer_forward, fullseq, gc_content, temp]
            tagged_primers.loc[len(tagged_primers)] = row # appends the row
            umi_track.append(umi)
            # add to the counters
            f_count += 1
            # print('passed %s'%(f_count))
            if f_count == num_primers:
                break
print('done with forward primers')
        
r_count = 0
for umi in umis_selected: # umi_preselected[0:20000] if you want to test things fast, you can select the first e.g. 20000 rows
    if umi not in umi_track:
        fullseq = umi + padd + primer_reverse
        gc_content = gc_fraction(fullseq) * 100
        temp = mt.Tm_NN(fullseq)
        sense = 'reverse' # python is zero indexed
        
        if gc_min <= gc_content <= gc_max and temp_min <= temp <= temp_max:
            row = [sense, umi, padd, primer_reverse, fullseq, gc_content, temp]
            tagged_primers.loc[len(tagged_primers)] = row # appends the row
            umi_track.append(umi)
            r_count += 1
            if r_count == num_primers:
                break
    
print('done with reverse primers')
            
print('Umis generated under the thresholds: %s'%(len(tagged_primers)))
print('Of which %s are forward and %s are reverse'%(len(tagged_primers[tagged_primers['sense'] == 'forward']),
                                                    len(tagged_primers[tagged_primers['sense'] == 'reverse'])))
tagged_primers.head(10)

done with forward primers
done with reverse primers
Umis generated under the thresholds: 192
Of which 96 are forward and 96 are reverse


Unnamed: 0,sense,umi_seq,padd_seq,primer_seq,full_primer,gc_content,temp
0,forward,CGAAGACTCGCGTCGT,GGTAG,GAYTTCATCAAGAACATGAT,CGAAGACTCGCGTCGTGGTAGGAYTTCATCAAGAACATGAT,47.5,66.082963
1,forward,CGAGTCTTCAGCGACG,GGTAG,GAYTTCATCAAGAACATGAT,CGAGTCTTCAGCGACGGGTAGGAYTTCATCAAGAACATGAT,47.5,65.819301
2,forward,CGGCACAGAGTTCTGC,GGTAG,GAYTTCATCAAGAACATGAT,CGGCACAGAGTTCTGCGGTAGGAYTTCATCAAGAACATGAT,47.5,66.254998
3,forward,CTCGCTGTAGAGCCAG,GGTAG,GAYTTCATCAAGAACATGAT,CTCGCTGTAGAGCCAGGGTAGGAYTTCATCAAGAACATGAT,47.5,65.432762
4,forward,CGATCTGGAGTCGACC,GGTAG,GAYTTCATCAAGAACATGAT,CGATCTGGAGTCGACCGGTAGGAYTTCATCAAGAACATGAT,47.5,65.423528
5,forward,CAGAGCGCCAGAGACC,GGTAG,GAYTTCATCAAGAACATGAT,CAGAGCGCCAGAGACCGGTAGGAYTTCATCAAGAACATGAT,50.0,66.922446
6,forward,CGCAGCACTGGTTGCA,GGTAG,GAYTTCATCAAGAACATGAT,CGCAGCACTGGTTGCAGGTAGGAYTTCATCAAGAACATGAT,47.5,66.692364
7,forward,CGGTGTCTACACACGG,GGTAG,GAYTTCATCAAGAACATGAT,CGGTGTCTACACACGGGGTAGGAYTTCATCAAGAACATGAT,47.5,65.657744
8,forward,CGGTGCACAGTCAGTC,GGTAG,GAYTTCATCAAGAACATGAT,CGGTGCACAGTCAGTCGGTAGGAYTTCATCAAGAACATGAT,47.5,66.078574
9,forward,CGCTCGTGGATCGACA,GGTAG,GAYTTCATCAAGAACATGAT,CGCTCGTGGATCGACAGGTAGGAYTTCATCAAGAACATGAT,47.5,65.855245


In [337]:
# before saving the primers, we need to test if they can form hairpin structures that would affect PCR

for row in tagged_primers.itertuples():
    # print(row.full_primer)
    # replace ambiguous nucleotides
    for nn in ['A','T','C','G']:
        # ambiguous nucleotides include
        seq = re.sub('[NRYKMSWBDHV]',nn, row.full_primer, flags = re.IGNORECASE)
        # seq = row.full_primer.replace('Y',nn)
        # predict secondary structures
        structures = fold(seq) # doesn't like ambiguous nucleotides
        for struct in structures:
            # concerning Delta Gs are the negative ones (indicate estability of the structure)
            # also worrying are hairpin structures that are long, two nucleotides is probably fine
            # also worrying if the hairpin happens at the 3' end, that affects primer annealing
            if 'HAIRPIN' in str(struct.desc) and struct.e < 0:
                print(f"Type: {struct.desc}, Position: {struct.ij}, Delta G: {struct.e:.2f} kcal/mol")
                print(f"Concerning primer sequence: {row.full_primer.replace}")
print('All primers checked')    

All primers checked


In [284]:
# save the file!
# TEF1α region Al33F (5′-GAYTTCATCAAGAACATGAT-′3) and Al33R (5′-GACGTTGAADCCRACRTTGTC-′3)
tagged_primers.to_csv('./TEFa_Al33F-Al33R_tagged.csv', sep = '\t', index_label = 'index')

# ITS metabarcoding for ENTWINE

**sampling**

9 sites and 6 trees on each site: 54 deadwood logs 

54 logs + 3 controls (fieldwork, extraction/amplification, sequencing) + sampled 3 times a year for 2 years:  342 deadwood samples to amplify. Give or take, 400 different primer combinations. I need to find two numbers that allow for 400 combinations (to minimise costs)

**primer choice**

Based on [Nilsson et al., 2019](10.1038/s41579-018-0116-y) and [Tedersoo et al., 2022](10.1111/mec.16460):

I would select ITS9MUNngs because it targets **the whole ITS** region (one and two) and can identify 100% of fungi (although bind to other eukaryotes too). For reverse, I would select LR5F as it doesnt bind to plants and can bind to 99% of fungi. Other primers targeting only fungi can bind to only 95% taxa. If I need an “internal” primer,  I will choose gITS7ngs because targets ITS2 (more recommended than ITS1 by the paper) and can bind to 100% fungi, plus other plants and protists. Reverse can be the same. The problem is that the reverse primer combined with the gITS7ngs has more than 5 5-degree annealing temperature difference which is not recommended. However, choosing ITS4ngUNI (other recommended reverse) has the same 5-degree annealing temperature difference as ITS9MUNngs. I’ll stick to ITS9MUNngs+LR5F for most samples and for anything else failing I will use the gITS7ngs as a backup for ITS2 at least, despite the 5-degree difference.

Given that I want to have 400 unique combinations and that I will use the same reverse primer (but two different forward primers), a good option is to go for:

- 10 forward and 40 reverse give 400 combinations. With twice as many forward primers (20): synthesising a total of 60 primers  

- **16 forward and 25 reverse give 400 combinations. With twice as many forward primers (32): synthesising a total of 57 primers**  

- 20 forward and 20 reverse give 400 combinations. With twice as many forward primers (40): synthesising a total of 60 primers  

As a backup region, I will choose LSU because it is a longer fragment and is less conserved across fungi than **SSU**

In [8]:
# number of primer combinations that should be unique
number = 400

# Function to find factor pairs
def find_factor_pairs(num):
    factor_pairs = []
    for i in range(1, int(num**0.5) + 1):
        if num % i == 0:
            factor_pairs.append((i, num // i))
    return factor_pairs

factor_pairs = find_factor_pairs(number)

for pair in factor_pairs:
    print(f"The numbers {pair[0]} and {pair[1]} multiplied give {number}")
    print(f"synthesising a total of {pair[0]+pair[1]} primers")

The numbers 1 and 400 multiplied give 400
synthesising a total of 401 primers
The numbers 2 and 200 multiplied give 400
synthesising a total of 202 primers
The numbers 4 and 100 multiplied give 400
synthesising a total of 104 primers
The numbers 5 and 80 multiplied give 400
synthesising a total of 85 primers
The numbers 8 and 50 multiplied give 400
synthesising a total of 58 primers
The numbers 10 and 40 multiplied give 400
synthesising a total of 50 primers
The numbers 16 and 25 multiplied give 400
synthesising a total of 41 primers
The numbers 20 and 20 multiplied give 400
synthesising a total of 40 primers


In [9]:
# Trichia striolata 5.8S ribosomal RNA gene, partial sequence; internal transcribed spacer 2, complete sequence; and 28S ribosomal RNA gene, partial sequence
# GenBank: AY014124.1

target = 'ATTGCAGAACACATTGAACATCGACATCTTGAACGCACATGGCGGCCTCGGGTCCATCCCGAGGCCACGCCCGTCTGAGGGTCGGCGAGTCAGACAAGCTATCGCCTCCAATTCTTTGGCCAGCAGGTCCCGTGCCTGAAAGCGATTTCCTTCCGCCGGTCTCTTCGCCCCCTTTCTCGCTCTGRAAGGGACGATTAGATGGGAGGATTCGCCGTATGGTCGGTACCGTGGAGGCGCTTTGGGTTTTCGCGGATCTCATGTCGATCCCCGTGACTTCAAGTTTACGCCGCGCCGTCCGCAGTCTCGCAGGCTTCGGGTGCAAGGAACAGCTCTGACCCGAAAATATATACAGCTCTGGTTTATTTTCGCGGTCTTCCGCGCACCTTGACCAGGGAGGCAGCAATTTCTCTTGACCTACCGACGCCGCCGCCGACGTGCTGGACTCGGATTGATCGGATATCATCGAGCCGACGGCGTGCTGGACGGCAGAGGGGGGAAAAGTGAAAAGTGGCGACGCCCACAGGTTATGGGAAGGTGAGCGCGCGTTACCGTGTTCGCTTTCCTGTATCCGACCTCAGATCGGACGAGATTACCCGCTGAATTTAAGCATATAACTAAGCGGAGGAAAAGAAACTAACAAGGATTCCCCCAGTAACGGCGAGTGAAGCGGGAATAGCCCAGCACCGAATCCCTCAGTGTCATGCTGACGGGAACTGTGGTGTGTGGGACGCCACCAGTCGCATCAGAGGGCGCCGAAGTCCTCCTGATCGGGGCTTCACCCAGAGCGGGTGTAAGGCCTTTGCGGGTGCCTCTCTGTGCGGCCGCGAGCGTCTCAGGAGTCGGGTTGTTTGGGAATGCAGCCCAAAGCGGGTGGTAAACTCCATCTAAGGCTAAATACGTGCACGAGTCCGATAGCGGACAAGTACCGTGAGGGAAAGTTGAAAAGAACTTTGAAGAGAGAGTTCAAGAGTACGTGAAACCGCCCAGAGGTAAACGGGTGGATCCGCAAAGTCGGCCCGCGGAATTCAGCGCAGCGCGCAGACCTGGGCTGTCGCAGTTCGAGATCCCTGGGACTCAGCCGCGACGTCGTTCGGGTCTCCGCTGCGTGCACTTTCCGCGGGCAGAGTGCCACAACCGGTTCGGCGGCGGCTACATGCTTGGAGGGTTGTAGGTGGGGGCGCTTGCGTCCCTACCAGCCCTTCTTTCGCGAGCCGCTGGCCGGACCGAGGGAACGCCGCGCGCTTCGAGGCCACCCTCCCCTCCTGGTGAGTTCGACTGGGAGAGACTGGGCAACCGTGTCTCCCGACCGCTCATCGGCGATCGGCGTGGGGCTGGCCGGGCGTGCATTGTCTGTCAAGGTTGGTGGCAAGTATGTCGGCATTCCACCCGACCCGTCTTGAA'

umi_raw = []
while len(umi_raw) <= 100000:
    umi = create_umi()
    umi_pass1 = count_nns(umi, threshold)
    if umi_pass1 not in umi_raw and umi_pass1 != None:
        umi_raw.append(umi_pass1)
print('Raw UMIs created: %s'%(len(umi_raw)))
print(umi_raw[0:10], '\n')

# check umis for homopolymers
N = 2 # allow the longest homopolymer to have two nucleotides
P = 2 # allow a max of one homopolymer

umi_uniq = []

for umi in umi_raw:
    if check_homopolymers(umi, N, P) is False:
        pass
    else:
        if umi not in umi_uniq:
            umi_uniq.append(umi)
print('Non-homopolymer UMIs with homopolymer number threshold %s and nucleotide count threshold %s'%(P,N))
print('Remaining UMIs %s'%(len(umi_uniq)))
print(umi_uniq[0:10], '\n')

# check for matches
umi_nofmatch = []

for umi in umi_uniq:
    check_seq_match(umi, target)
    umibool, statement = check_seq_match(umi, target)
    if umibool != False: 
        if umibool not in umi_nofmatch and umibool != True:
            umi_nofmatch.append(umibool)
print('UMIs with no full match')
print('Remaining UMIs %s'%(len(umi_nofmatch)))
print(umi_nofmatch[0:10], '\n')

# check for partial matches
umi_preselected = []

for umi in umi_nofmatch:
    umibool, statement = check_partial_matches(umi, target, min_match = 6)
    if umibool != False:
        if umibool not in umi_preselected:
            umi_preselected.append(umibool)
            
print('UMIs with no partial match')
print('Remaining UMIs %s'%(len(umi_preselected))) # number of umis that pass all tests
umi_preselected[0:10] # prints the first 10 in the list

Raw UMIs created: 100001
['TGGCGTATCAACATTC', 'TGGAATCATCTACCGG', 'GACCTATTCCAAGGAC', 'CGAATTCACTGTGGTC', 'ATGCCAGTGTGTACTC', 'GTACAGACTCGGACAT', 'AGTTTTAGCCCAGCTA', 'TGTCCTCTAGATGAGA', 'GAATTATCCTGGGGCA', 'GGTTCCCTACGATACA'] 

Non-homopolymer UMIs with homopolymer number threshold 2 and nucleotide count threshold 2
Remaining UMIs 35219
['ATGCCAGTGTGTACTC', 'GTACAGACTCGGACAT', 'TGTCCTCTAGATGAGA', 'CCAGTCGCGTAATCGT', 'AGGTCACCTAGTCAGA', 'CATGACACTGTCGTAT', 'TACGCTTTACGAGTAC', 'GCTTCATGAGCGAGCA', 'TAGCGCCGAGACTGAT', 'CGCACGCTTAGGACTA'] 

UMIs with no full match
Remaining UMIs 35219
['ATGCCAGTGTGTACTC', 'GTACAGACTCGGACAT', 'TGTCCTCTAGATGAGA', 'CCAGTCGCGTAATCGT', 'AGGTCACCTAGTCAGA', 'CATGACACTGTCGTAT', 'TACGCTTTACGAGTAC', 'GCTTCATGAGCGAGCA', 'TAGCGCCGAGACTGAT', 'CGCACGCTTAGGACTA'] 

UMIs with no partial match
Remaining UMIs 26673


['ATGCCAGTGTGTACTC',
 'GTACAGACTCGGACAT',
 'TGTCCTCTAGATGAGA',
 'CCAGTCGCGTAATCGT',
 'AGGTCACCTAGTCAGA',
 'CATGACACTGTCGTAT',
 'TACGCTTTACGAGTAC',
 'GCTTCATGAGCGAGCA',
 'TAGCGCCGAGACTGAT',
 'CCGTCGAACTGATGCA']

In [11]:
first_nn = [umi[0] for umi in umi_preselected]
print(len(first_nn)) # should be the same number of umis in the pre-selected list
print(Counter(first_nn))

umis_selected = [umi for umi in umi_preselected if umi[0] == 'A']
print(len(umis_selected))

26673
Counter({'A': 6902, 'T': 6844, 'C': 6503, 'G': 6424})
6902


### ITS9MUNngs and LR5F primers

In [None]:
# Fusarium verticillioides isolate CoPTY14.5_LSU large subunit ribosomal RNA gene, partial sequence
# GenBank: PP857690.1
target = 'GGGATTGCCCTAGTAACGGCGAGTGAAGCGGCAACAGCTCAAATTTGAAATCTGGCTCTCGGGCCCGAGTTGTAATTTGTAGAGGATACTTTTGATGCGGTGCCTTCCGAGTTCCCTGGAACGGGACGCCATAGAGGGTGAGAGCCCCGTCTGGTTGGATGCCAAATCTCTGTAAAGTTCCTTCGACGAGTCGAGTAGTTTGGGAATGCTGCTCTAAATGGGAGGTATATGTCTTCTAAAGCTAAATACCGGCCAGAGACCGATAGCGCACAAGTAGAGTGATCGAAAGATGAAAAGCACTTTGAAAAGAGAGTTAAAAAGTACGTGAAATTGTTGAAAGGGAAGCGTTTATGACCAGACTTGGGCTTGGTTAATCATCTGGGGTTCTCCCCAGTGCACTTTTCCAGTCCAGGCCAGCATCAGTTTTCGCCGGGGGATAAAGACTTCGGGAATGTGGCTCTCTTCGGGGAGTGTTATAGCCCGTTGTGTAATACCCTGGCGGGGACTGAGGTTCGCGCATCTGCAAGGATGCTGGCGTAATGGTCATCAACGACCCGTCTTGAAACACGGACCAAGGAGTCGTCTTCGTATGCGAGTGTTCGGGTGTCAAACCCCTACGCGTAATGAAAGTGAACGCAGGTGAGAGCTTCGGCGCATCATCGACCGATCCTGATGTTCTCGGATGGATTTGAGTAAGAGCATACGGGGCCGGACCCGAAAGAAGGTGAACTATGCCTGTATAGGGTGAAGCCAGAGGAAACTCTGGTGGAGGCTCGCAGCGGTTCTGACGTGCAAATCGATCGTCAAATATGGGCATGGGGGCGAAAGACTAATCGAACCTTCTAGTAGCTGGTTTCCGCCGAAGTTTCCCTCAGGATAGCAGTGTTGAACTCAGTTTTATGAGGTAAAGCGAATGATTAGGGACTCGGGGGCGCTATTTAGCCTTCATCCATTCTCAAACTTTAAATATGTAAGAAGCTCTTGTTGCTTAATTGAACGTGAGCATTCGAATGTATCAACACTAGTGGGCCATTTTTGGTAAGCAGAACTGGCGATGCGGGATGAACCGAACGCGAGGTTAAGGTGCCAGAGTAGACGCTCATCAGACACCACAAAAGGTGTTAGTACATCTTGACAGCAGGACGGTGGCCATGGAAGTCGGAATCCGCTAAGGACTGTGTAACAACTCACCTGCCGAATGTACTAGCCCTGAAAATGGATGGCGCTCAAGCGTCTCACCCATACCTCGCCCTCAGGGTAGAAACGATGCCCTGAGGAGTAGGCGGACGTGGAGGTCAGTGACGAAGCCTAGGG'

umi_raw = []
while len(umi_raw) <= 100000:
    umi = create_umi()
    umi_pass1 = count_nns(umi, threshold)
    if umi_pass1 not in umi_raw and umi_pass1 != None:
        umi_raw.append(umi_pass1)
print('Raw UMIs created: %s'%(len(umi_raw)))
print(umi_raw[0:10], '\n')

# check umis for homopolymers
N = 2 # allow the longest homopolymer to have two nucleotides
P = 2 # allow a max of one homopolymer

umi_uniq = []

for umi in umi_raw:
    if check_homopolymers(umi, N, P) is False:
        pass
    else:
        if umi not in umi_uniq:
            umi_uniq.append(umi)
print('Non-homopolymer UMIs with homopolymer number threshold %s and nucleotide count threshold %s'%(P,N))
print('Remaining UMIs %s'%(len(umi_uniq)))
print(umi_uniq[0:10], '\n')

# check for matches
umi_nofmatch = []

for umi in umi_uniq:
    check_seq_match(umi, target)
    umibool, statement = check_seq_match(umi, target)
    if umibool != False: 
        if umibool not in umi_nofmatch and umibool != True:
            umi_nofmatch.append(umibool)
print('UMIs with no full match')
print('Remaining UMIs %s'%(len(umi_nofmatch)))
print(umi_nofmatch[0:10], '\n')

# check for partial matches
umi_preselected = []

for umi in umi_nofmatch:
    umibool, statement = check_partial_matches(umi, target, min_match = 6)
    if umibool != False:
        if umibool not in umi_preselected:
            umi_preselected.append(umibool)
            
print('UMIs with no partial match')
print('Remaining UMIs %s'%(len(umi_preselected))) # number of umis that pass all tests
umi_preselected[0:10] # prints the first 10 in the list

In [22]:
# finally, calculate GC content and annealing temperature
# for the remaining 142435 umis (we need 96*2)
# we can check the first 1000 or so umis
# we will select those with compatible temperatures and good CG content

# blasting this padd against internal transcribed spacer sequences from fungi (core_nt)
# return no significant matches (too short of a sequence)

# the selection of pad sequences affect GC content and annealing temperatures
# AT have fewer bonds and split at lower temperatures
# this pad is not in the target sequence (at least ITS2 and surrounding regions) nor in the primers

padd = 'ATGAC'

primer_forward = 'TACACACCGCCCGTCG'
primer_reverse = 'CGATCGATTTGCACGTCAGA'

gc_min = 46
gc_max = 54

# no more than 5 degree difference in annealing temperatures between primer pairs
temp_min = 55
temp_max = 67

# how many tagged primers to generate
# e.g. if num_primers = 96, it will generate 96 forward and 96 reverse
num_primers = 25 # max number, we can select the "first 16 forward"

# creates the table to store information
tagged_primers = pd.DataFrame(columns = ['sense','umi_seq','padd_seq','primer_seq','full_primer','gc_content','temp'])

# to make sure no same umi is used for two primers within one pair:
# i.e. one umi used in a forward primer cannot longer be used for a forward.
umi_track = []

# initiate counters to generate just the primers necessary
f_count = 0
for umi in umis_selected: # umi_preselected[0:20000] if you want to test things fast, you can select the first e.g. 20000 rows
    # padd should be between umi and primer to prevent the umi from acting as a primer
    # see 10.1038/s41579-018-0116-y and 10.1111/mec.16460
    if umi not in umi_track:
        fullseq = umi + padd + primer_forward
        # print(fullseq)
        gc_content = gc_fraction(fullseq) * 100
        # print(gc_content)
        temp = mt.Tm_NN(fullseq)
        # print(temp)
        sense = 'forward' # python is zero indexed

        # add thresholds if you wanna keep padd+umi+primer temperatures that overlap the
        # PCR protocol annealing temperatures already
        # and to keep the GC content close to 50% - second filter for that
        # greater than or equal AND lesser then or equal to
        if gc_min <= gc_content <= gc_max and temp_min <= temp <= temp_max:
            row = [sense, umi, padd, primer_forward, fullseq, gc_content, temp]
            tagged_primers.loc[len(tagged_primers)] = row # appends the row
            umi_track.append(umi)
            # add to the counters
            f_count += 1
            # print('passed %s'%(f_count))
            if f_count == num_primers:
                break
print('done with forward primers')
        
r_count = 0
for umi in umis_selected: # umi_preselected[0:20000] if you want to test things fast, you can select the first e.g. 20000 rows
    if umi not in umi_track:
        fullseq = padd + umi + primer_reverse
        gc_content = gc_fraction(fullseq) * 100
        temp = mt.Tm_NN(fullseq)
        sense = 'reverse' # python is zero indexed
        
        if gc_min <= gc_content <= gc_max and temp_min <= temp <= temp_max:
            row = [sense, umi, padd, primer_reverse, fullseq, gc_content, temp]
            tagged_primers.loc[len(tagged_primers)] = row # appends the row
            umi_track.append(umi)
            r_count += 1
            if r_count == num_primers:
                break
    
print('done with reverse primers')
            
print('Umis generated under the thresholds: %s'%(len(tagged_primers)))
print('Of which %s are forward and %s are reverse'%(len(tagged_primers[tagged_primers['sense'] == 'forward']),
                                                    len(tagged_primers[tagged_primers['sense'] == 'reverse'])))

# checking for hairpins
for row in tagged_primers.itertuples():
    for nn in ['A','T','C','G']:
        seq = re.sub('[NRYKMSWBDHV]',nn, row.full_primer, flags = re.IGNORECASE)
        structures = fold(seq) # doesn't like ambiguous nucleotides
        for struct in structures:
            if 'HAIRPIN' in str(struct.desc) and struct.e < 0:
                print(f"Type: {struct.desc}, Position: {struct.ij}, Delta G: {struct.e:.2f} kcal/mol")
                print(f"Concerning primer sequence: {row.full_primer.replace}")
print('All primers checked for hairpin structures')
# save the file
tagged_primers.to_csv('./ITS9MUNngs_LR5F-ITSfull_tagged.csv', sep = '\t', index_label = 'index')

tagged_primers.head(10)

done with forward primers
done with reverse primers
Umis generated under the thresholds: 50
Of which 25 are forward and 25 are reverse
All primers checked for hairpin structures


Unnamed: 0,sense,umi_seq,padd_seq,primer_seq,full_primer,gc_content,temp
0,forward,AACCATATATCGTGTG,ATGAC,TACACACCGCCCGTCG,AACCATATATCGTGTGATGACTACACACCGCCCGTCG,51.351351,66.41418
1,forward,ATACAGATTATCTGGC,ATGAC,TACACACCGCCCGTCG,ATACAGATTATCTGGCATGACTACACACCGCCCGTCG,51.351351,66.301631
2,forward,ATATGATACTCAGCGT,ATGAC,TACACACCGCCCGTCG,ATATGATACTCAGCGTATGACTACACACCGCCCGTCG,51.351351,65.994329
3,forward,ATTAGGTACTCATGCA,ATGAC,TACACACCGCCCGTCG,ATTAGGTACTCATGCAATGACTACACACCGCCCGTCG,51.351351,66.553838
4,forward,ATATCCTGTAGCGATA,ATGAC,TACACACCGCCCGTCG,ATATCCTGTAGCGATAATGACTACACACCGCCCGTCG,51.351351,66.01419
5,forward,ACTATCGGTAACTGAT,ATGAC,TACACACCGCCCGTCG,ACTATCGGTAACTGATATGACTACACACCGCCCGTCG,51.351351,65.786334
6,forward,ACGTATAGACTCGTAT,ATGAC,TACACACCGCCCGTCG,ACGTATAGACTCGTATATGACTACACACCGCCCGTCG,51.351351,65.483666
7,forward,AGATGCTGTATCAATC,ATGAC,TACACACCGCCCGTCG,AGATGCTGTATCAATCATGACTACACACCGCCCGTCG,51.351351,66.579459
8,forward,AGCGATGCTTAATCTA,ATGAC,TACACACCGCCCGTCG,AGCGATGCTTAATCTAATGACTACACACCGCCCGTCG,51.351351,66.730449
9,forward,AGTCTACGTCGATAAT,ATGAC,TACACACCGCCCGTCG,AGTCTACGTCGATAATATGACTACACACCGCCCGTCG,51.351351,65.780944


### gITS7ngs and LR5F primers

In [23]:
# finally, calculate GC content and annealing temperature
# for the remaining 142435 umis (we need 96*2)
# we can check the first 1000 or so umis
# we will select those with compatible temperatures and good CG content

# blasting this padd against internal transcribed spacer sequences from fungi (core_nt)
# return no significant matches (too short of a sequence)

padd = 'ATGAC'

primer_forward = 'GTGARTCATCRARTYTTTG'
primer_reverse = 'CGATCGATTTGCACGTCAGA'

gc_min = 46
gc_max = 54

# no more than 5 degree difference in annealing temperatures between primer pairs
temp_min = 62
temp_max = 67

# how many tagged primers to generate
# e.g. if num_primers = 96, it will generate 96 forward and 96 reverse
num_primers = 25 # max number, we can select the "first 16 forward"

# creates the table to store information
tagged_primers = pd.DataFrame(columns = ['sense','umi_seq','padd_seq','primer_seq','full_primer','gc_content','temp'])

# I am not reinitiating this to make sure the tags in the forward primers don't repeat
# umi_track = []

f_count = 0
for umi in umis_selected: # umi_preselected[0:20000] if you want to test things fast, you can select the first e.g. 20000 rows
    if umi not in umi_track:
        fullseq = umi + padd + primer_forward
        gc_content = gc_fraction(fullseq) * 100
        temp = mt.Tm_NN(fullseq)
        sense = 'forward' # python is zero indexed

        if gc_min <= gc_content <= gc_max and temp_min <= temp <= temp_max:
            row = [sense, umi, padd, primer_forward, fullseq, gc_content, temp]
            tagged_primers.loc[len(tagged_primers)] = row # appends the row
            umi_track.append(umi)
            f_count += 1
            if f_count == num_primers:
                break
print('done with forward primers')
        
r_count = 0
for umi in umis_selected: # umi_preselected[0:20000] if you want to test things fast, you can select the first e.g. 20000 rows
    if umi not in umi_track:
        fullseq = padd + umi + primer_reverse
        gc_content = gc_fraction(fullseq) * 100
        temp = mt.Tm_NN(fullseq)
        sense = 'reverse' # python is zero indexed
        
        if gc_min <= gc_content <= gc_max and temp_min <= temp <= temp_max:
            row = [sense, umi, padd, primer_reverse, fullseq, gc_content, temp]
            tagged_primers.loc[len(tagged_primers)] = row # appends the row
            umi_track.append(umi)
            r_count += 1
            if r_count == num_primers:
                break
    
print('done with reverse primers')
            
print('Umis generated under the thresholds: %s'%(len(tagged_primers)))
print('Of which %s are forward and %s are reverse'%(len(tagged_primers[tagged_primers['sense'] == 'forward']),
                                                    len(tagged_primers[tagged_primers['sense'] == 'reverse'])))

# checking for hairpins
for row in tagged_primers.itertuples():
    for nn in ['A','T','C','G']:
        seq = re.sub('[NRYKMSWBDHV]',nn, row.full_primer, flags = re.IGNORECASE)
        structures = fold(seq) # doesn't like ambiguous nucleotides
        for struct in structures:
            if 'HAIRPIN' in str(struct.desc) and struct.e < 0:
                print(f"Type: {struct.desc}, Position: {struct.ij}, Delta G: {struct.e:.2f} kcal/mol")
                print(f"Concerning primer sequence: {row.full_primer.replace}")
print('All primers checked for hairpin structures')

# save the file
tagged_primers.to_csv('./gITS7ngs_LR5F-ITS2_tagged.csv', sep = '\t', index_label = 'index')

tagged_primers.head(10)

done with forward primers
done with reverse primers
Umis generated under the thresholds: 50
Of which 25 are forward and 25 are reverse
All primers checked for hairpin structures


Unnamed: 0,sense,umi_seq,padd_seq,primer_seq,full_primer,gc_content,temp
0,forward,ACTGAGCGGATCCTCG,ATGAC,GTGARTCATCRARTYTTTG,ACTGAGCGGATCCTCGATGACGTGARTCATCRARTYTTTG,47.222222,64.420612
1,forward,ACTGTCACGGTGCAGC,ATGAC,GTGARTCATCRARTYTTTG,ACTGTCACGGTGCAGCATGACGTGARTCATCRARTYTTTG,47.222222,65.443014
2,forward,AGCAACGTGTCTCGCG,ATGAC,GTGARTCATCRARTYTTTG,AGCAACGTGTCTCGCGATGACGTGARTCATCRARTYTTTG,47.222222,65.455563
3,forward,ACAGCTGCCTGTAGGC,ATGAC,GTGARTCATCRARTYTTTG,ACAGCTGCCTGTAGGCATGACGTGARTCATCRARTYTTTG,47.222222,65.029897
4,forward,AGCGGACGACTCTCTG,ATGAC,GTGARTCATCRARTYTTTG,AGCGGACGACTCTCTGATGACGTGARTCATCRARTYTTTG,47.222222,64.366399
5,forward,ATGTCGCGGCACGTCA,ATGAC,GTGARTCATCRARTYTTTG,ATGTCGCGGCACGTCAATGACGTGARTCATCRARTYTTTG,47.222222,65.808253
6,forward,AGCGCGCTCGATATCG,ATGAC,GTGARTCATCRARTYTTTG,AGCGCGCTCGATATCGATGACGTGARTCATCRARTYTTTG,47.222222,65.142686
7,forward,ATGACCCGTCGTGCAG,ATGAC,GTGARTCATCRARTYTTTG,ATGACCCGTCGTGCAGATGACGTGARTCATCRARTYTTTG,47.222222,65.023556
8,forward,ATTGCGACAGCGGCTC,ATGAC,GTGARTCATCRARTYTTTG,ATTGCGACAGCGGCTCATGACGTGARTCATCRARTYTTTG,47.222222,65.690245
9,forward,ATCGCCTGACTGACGG,ATGAC,GTGARTCATCRARTYTTTG,ATCGCCTGACTGACGGATGACGTGARTCATCRARTYTTTG,47.222222,64.722753


### FLR3 and FLR4 primers

32	LSU	FLR3	forward	GGTCCGTGTTTCAAGACG	LSU	55	Tedersoo et al., 2015  
33	LSU	FLR4	reverse	TGCATGGCCGAATAGGATTA	LSU	55	Tedersoo et al., 2015  

In [10]:
# Fusarium verticillioides isolate CoPTY14.5_LSU large subunit ribosomal RNA gene, partial sequence
# GenBank: PP857690.1

target = 'GGGATTGCCCTAGTAACGGCGAGTGAAGCGGCAACAGCTCAAATTTGAAATCTGGCTCTCGGGCCCGAGTTGTAATTTGTAGAGGATACTTTTGATGCGGTGCCTTCCGAGTTCCCTGGAACGGGACGCCATAGAGGGTGAGAGCCCCGTCTGGTTGGATGCCAAATCTCTGTAAAGTTCCTTCGACGAGTCGAGTAGTTTGGGAATGCTGCTCTAAATGGGAGGTATATGTCTTCTAAAGCTAAATACCGGCCAGAGACCGATAGCGCACAAGTAGAGTGATCGAAAGATGAAAAGCACTTTGAAAAGAGAGTTAAAAAGTACGTGAAATTGTTGAAAGGGAAGCGTTTATGACCAGACTTGGGCTTGGTTAATCATCTGGGGTTCTCCCCAGTGCACTTTTCCAGTCCAGGCCAGCATCAGTTTTCGCCGGGGGATAAAGACTTCGGGAATGTGGCTCTCTTCGGGGAGTGTTATAGCCCGTTGTGTAATACCCTGGCGGGGACTGAGGTTCGCGCATCTGCAAGGATGCTGGCGTAATGGTCATCAACGACCCGTCTTGAAACACGGACCAAGGAGTCGTCTTCGTATGCGAGTGTTCGGGTGTCAAACCCCTACGCGTAATGAAAGTGAACGCAGGTGAGAGCTTCGGCGCATCATCGACCGATCCTGATGTTCTCGGATGGATTTGAGTAAGAGCATACGGGGCCGGACCCGAAAGAAGGTGAACTATGCCTGTATAGGGTGAAGCCAGAGGAAACTCTGGTGGAGGCTCGCAGCGGTTCTGACGTGCAAATCGATCGTCAAATATGGGCATGGGGGCGAAAGACTAATCGAACCTTCTAGTAGCTGGTTTCCGCCGAAGTTTCCCTCAGGATAGCAGTGTTGAACTCAGTTTTATGAGGTAAAGCGAATGATTAGGGACTCGGGGGCGCTATTTAGCCTTCATCCATTCTCAAACTTTAAATATGTAAGAAGCTCTTGTTGCTTAATTGAACGTGAGCATTCGAATGTATCAACACTAGTGGGCCATTTTTGGTAAGCAGAACTGGCGATGCGGGATGAACCGAACGCGAGGTTAAGGTGCCAGAGTAGACGCTCATCAGACACCACAAAAGGTGTTAGTACATCTTGACAGCAGGACGGTGGCCATGGAAGTCGGAATCCGCTAAGGACTGTGTAACAACTCACCTGCCGAATGTACTAGCCCTGAAAATGGATGGCGCTCAAGCGTCTCACCCATACCTCGCCCTCAGGGTAGAAACGATGCCCTGAGGAGTAGGCGGACGTGGAGGTCAGTGACGAAGCCTAGGG'

umi_raw = []
while len(umi_raw) <= 100000:
    umi = create_umi()
    umi_pass1 = count_nns(umi, threshold)
    if umi_pass1 not in umi_raw and umi_pass1 != None:
        umi_raw.append(umi_pass1)
print('Raw UMIs created: %s'%(len(umi_raw)))
print(umi_raw[0:10], '\n')

# check umis for homopolymers
N = 2 # allow the longest homopolymer to have two nucleotides
P = 2 # allow a max of one homopolymer

umi_uniq = []

for umi in umi_raw:
    if check_homopolymers(umi, N, P) is False:
        pass
    else:
        if umi not in umi_uniq:
            umi_uniq.append(umi)
print('Non-homopolymer UMIs with homopolymer number threshold %s and nucleotide count threshold %s'%(P,N))
print('Remaining UMIs %s'%(len(umi_uniq)))
print(umi_uniq[0:10], '\n')

# check for matches
umi_nofmatch = []

for umi in umi_uniq:
    check_seq_match(umi, target)
    umibool, statement = check_seq_match(umi, target)
    if umibool != False: 
        if umibool not in umi_nofmatch and umibool != True:
            umi_nofmatch.append(umibool)
print('UMIs with no full match')
print('Remaining UMIs %s'%(len(umi_nofmatch)))
print(umi_nofmatch[0:10], '\n')

# check for partial matches
umi_preselected = []

for umi in umi_nofmatch:
    umibool, statement = check_partial_matches(umi, target, min_match = 6)
    if umibool != False:
        if umibool not in umi_preselected:
            umi_preselected.append(umibool)
            
print('UMIs with no partial match')
print('Remaining UMIs %s'%(len(umi_preselected))) # number of umis that pass all tests
umi_preselected[0:10] # prints the first 10 in the list

Raw UMIs created: 100001
['GCCCTAGACACGTTGT', 'CCGGGAATTATCGTTG', 'ATGAATGGTCCACTGG', 'CAACTACATGACTGGC', 'TGGGTACGATCAGCAT', 'GGAATCTGACACATTT', 'TCATAAGCGCTAGATT', 'GATATCGCATTGAGCA', 'ACGTGGAACACTGTCC', 'CGCTGTCATAGAGCCT'] 

Non-homopolymer UMIs with homopolymer number threshold 2 and nucleotide count threshold 2
Remaining UMIs 35277
['CAACTACATGACTGGC', 'TGGGTACGATCAGCAT', 'TCATAAGCGCTAGATT', 'GATATCGCATTGAGCA', 'CGCTGTCATAGAGCCT', 'CGTCGCGTACAGTAAA', 'TATCAGTCGACGGCGT', 'GTAGTATCGCTCGACA', 'TCAGGCGACTACACAT', 'CCGTGACAGTAACGAT'] 

UMIs with no full match
Remaining UMIs 35277
['CAACTACATGACTGGC', 'TGGGTACGATCAGCAT', 'TCATAAGCGCTAGATT', 'GATATCGCATTGAGCA', 'CGCTGTCATAGAGCCT', 'CGTCGCGTACAGTAAA', 'TATCAGTCGACGGCGT', 'GTAGTATCGCTCGACA', 'TCAGGCGACTACACAT', 'CCGTGACAGTAACGAT'] 

UMIs with no partial match
Remaining UMIs 27338


['CAACTACATGACTGGC',
 'TGGGTACGATCAGCAT',
 'TCATAAGCGCTAGATT',
 'GATATCGCATTGAGCA',
 'CGCTGTCATAGAGCCT',
 'TATCAGTCGACGGCGT',
 'AGACGTTTAGAGCACT',
 'TGTAGTCGCAAGCTAC',
 'TTCATCAACAGAGTGT',
 'ATAGGCGCTAGTTGCA']

In [12]:
first_nn = [umi[0] for umi in umi_preselected]
print(len(first_nn)) # should be the same number of umis in the pre-selected list
print(Counter(first_nn))

# selecting A just because all ITS primers also start with an A and might reduce bias if I decide to sequence them together (ITS and LSU)
# whether I sequence them together or not is still under discussion though.
umis_selected = [umi for umi in umi_preselected if umi[0] == 'A']
print(len(umis_selected))

27338
Counter({'C': 6899, 'A': 6890, 'T': 6847, 'G': 6702})
6890


In [13]:
# finally, calculate GC content and annealing temperature
# for the remaining 142435 umis (we need 96*2)
# we can check the first 1000 or so umis
# we will select those with compatible temperatures and good CG content

# blasting this padd against internal transcribed spacer sequences from fungi (core_nt)
# return no significant matches (too short of a sequence)

padd = 'ACTAC'

primer_forward = 'GGTCCGTGTTTCAAGACG'
primer_reverse = 'TGCATGGCCGAATAGGATTA'

gc_min = 46
gc_max = 54

# no more than 5 degree difference in annealing temperatures between primer pairs
temp_min = 62
temp_max = 67

# how many tagged primers to generate
# e.g. if num_primers = 96, it will generate 96 forward and 96 reverse
num_primers = 25 # we are not mixing these up, we can do 16 and 25

# creates the table to store information
tagged_primers = pd.DataFrame(columns = ['sense','umi_seq','padd_seq','primer_seq','full_primer','gc_content','temp'])

f_count = 0
for umi in umis_selected: # umi_preselected[0:20000] if you want to test things fast, you can select the first e.g. 20000 rows
    if umi not in umi_track:
        fullseq = umi + padd + primer_forward
        gc_content = gc_fraction(fullseq) * 100
        temp = mt.Tm_NN(fullseq)
        sense = 'forward' # python is zero indexed

        if gc_min <= gc_content <= gc_max and temp_min <= temp <= temp_max:
            row = [sense, umi, padd, primer_forward, fullseq, gc_content, temp]
            tagged_primers.loc[len(tagged_primers)] = row # appends the row
            umi_track.append(umi)
            f_count += 1
            if f_count == num_primers:
                break
print('done with forward primers')
        
r_count = 0
for umi in umis_selected: # umi_preselected[0:20000] if you want to test things fast, you can select the first e.g. 20000 rows
    if umi not in umi_track:
        fullseq = padd + umi + primer_reverse
        gc_content = gc_fraction(fullseq) * 100
        temp = mt.Tm_NN(fullseq)
        sense = 'reverse' # python is zero indexed
        
        if gc_min <= gc_content <= gc_max and temp_min <= temp <= temp_max:
            row = [sense, umi, padd, primer_reverse, fullseq, gc_content, temp]
            tagged_primers.loc[len(tagged_primers)] = row # appends the row
            umi_track.append(umi)
            r_count += 1
            if r_count == num_primers:
                break
    
print('done with reverse primers')
            
print('Umis generated under the thresholds: %s'%(len(tagged_primers)))
print('Of which %s are forward and %s are reverse'%(len(tagged_primers[tagged_primers['sense'] == 'forward']),
                                                    len(tagged_primers[tagged_primers['sense'] == 'reverse'])))

# checking for hairpins
for row in tagged_primers.itertuples():
    for nn in ['A','T','C','G']:
        seq = re.sub('[NRYKMSWBDHV]',nn, row.full_primer, flags = re.IGNORECASE)
        structures = fold(seq) # doesn't like ambiguous nucleotides
        for struct in structures:
            if 'HAIRPIN' in str(struct.desc) and struct.e < 0:
                print(f"Type: {struct.desc}, Position: {struct.ij}, Delta G: {struct.e:.2f} kcal/mol")
                print(f"Concerning primer sequence: {row.full_primer.replace}")
print('All primers checked for hairpin structures')

# save the file
tagged_primers.to_csv('./FLR3_FLR4-LSU_tagged.csv', sep = '\t', index_label = 'index')

tagged_primers.head(10)

done with forward primers
done with reverse primers
Umis generated under the thresholds: 50
Of which 25 are forward and 25 are reverse
All primers checked for hairpin structures


Unnamed: 0,sense,umi_seq,padd_seq,primer_seq,full_primer,gc_content,temp
0,forward,AGACGTTTAGAGCACT,ACTAC,GGTCCGTGTTTCAAGACG,AGACGTTTAGAGCACTACTACGGTCCGTGTTTCAAGACG,48.717949,65.927382
1,forward,ACTGTTCATAATCGCG,ACTAC,GGTCCGTGTTTCAAGACG,ACTGTTCATAATCGCGACTACGGTCCGTGTTTCAAGACG,48.717949,66.564916
2,forward,ATGAGTTCTACGGTAC,ACTAC,GGTCCGTGTTTCAAGACG,ATGAGTTCTACGGTACACTACGGTCCGTGTTTCAAGACG,48.717949,65.803531
3,forward,ACTAGTCACGCTTGAT,ACTAC,GGTCCGTGTTTCAAGACG,ACTAGTCACGCTTGATACTACGGTCCGTGTTTCAAGACG,48.717949,65.963793
4,forward,AGTCATTAGTCAGCAT,ACTAC,GGTCCGTGTTTCAAGACG,AGTCATTAGTCAGCATACTACGGTCCGTGTTTCAAGACG,46.153846,64.778868
5,forward,AGCGGTACTATTCACA,ACTAC,GGTCCGTGTTTCAAGACG,AGCGGTACTATTCACAACTACGGTCCGTGTTTCAAGACG,48.717949,66.257321
6,forward,ATCGTACCAGCTCTAG,ACTAC,GGTCCGTGTTTCAAGACG,ATCGTACCAGCTCTAGACTACGGTCCGTGTTTCAAGACG,51.282051,66.637195
7,forward,ATATCTGCTGATGCAC,ACTAC,GGTCCGTGTTTCAAGACG,ATATCTGCTGATGCACACTACGGTCCGTGTTTCAAGACG,48.717949,66.486571
8,forward,ACTAACGGCTACTGAG,ACTAC,GGTCCGTGTTTCAAGACG,ACTAACGGCTACTGAGACTACGGTCCGTGTTTCAAGACG,51.282051,66.874454
9,forward,ATGACCTCTATATGGC,ACTAC,GGTCCGTGTTTCAAGACG,ATGACCTCTATATGGCACTACGGTCCGTGTTTCAAGACG,48.717949,65.784812
