In [7]:
from itertools import combinations
from Levenshtein import distance

# List of primer sequences
primers = [
    "AGCTGATCGATCGTACGTAG",
    "TGCATGACTGCTAGCTACGT",
    "CGTACGCTAGTCGATGACGT",
    "GCTACGATGCTAGTCAGTAC",
    "TACGCTAGTGCATGACTCAG",
    "CTAGTACGATCGTAGCTAGT",
    "GATCGTACGCTAGTACGTAC",
    "ACTGATCGTAGCTACGTCGA",
    "TAGCTACGTGACTAGTCGAT",
    "GTCGATGACTGATCGTAGCT",
    "CGTAGCTACGTAGTCGATGC",
    "TGCATCGTAGCTGATCGTAC",
    "GACTAGCTGATCGTACGTCA",
    "CTAGCTAGTACGTCAGATGC",
    "TGACTGCTAGTCGATGACAT",
    "GATCGTACAGCTAGTACGCT",
    "CTAGTGCATCGTACGATGCA",
    "ACTGATCGATCGTAGCTGCA",
    "TAGCTGATGCTAGTCGACAT",
    "GCTAGTCAGCTAGTGACGCT"
]

# Check Levenshtein distance between all pairs
valid = True
for seq1, seq2 in combinations(primers, 2):
    if distance(seq1, seq2) < 5:
        print(f"Primers '{seq1}' and '{seq2}' have a Levenshtein distance less than 3.", distance(seq1, seq2))
        valid = False

if valid:
    print("All primers have a minimum Levenshtein distance of 3.")
else:
    print("Some primers do not meet the minimum Levenshtein distance requirement.")


Primers 'CGTACGCTAGTCGATGACGT' and 'TGACTGCTAGTCGATGACAT' have a Levenshtein distance less than 3. 4
Primers 'GATCGTACGCTAGTACGTAC' and 'GATCGTACAGCTAGTACGCT' have a Levenshtein distance less than 3. 4
Primers 'CGTAGCTACGTAGTCGATGC' and 'CTAGCTAGTACGTCAGATGC' have a Levenshtein distance less than 3. 4
Primers 'GATCGTACAGCTAGTACGCT' and 'GCTAGTCAGCTAGTGACGCT' have a Levenshtein distance less than 3. 4
Some primers do not meet the minimum Levenshtein distance requirement.


In [10]:
gene_list = open("all_new_genes_and_targets.txt")
gene_list_out = open("all_new_genes_and_targets_output.txt","w")

for line in gene_list:
    sp = line.strip().split("\t")
    if sp[1] == "":
        gene_list_out.write(sp[0].split()[0] + "\t" + " ".join(sp[0].split()[1:]) + "\t" + sp[2] + "\n")
    else:
        gene_list_out.write(line)


In [24]:
import random
from Bio.SeqUtils import MeltingTemp as mt
from Levenshtein import distance

# Function to generate a random 20 base pair sequence with balanced GC content
def generate_primer():
    bases = ['A', 'T', 'C', 'G']
    primer = ""
    while True:
        while len(primer) < 19:
            base = random.choice(bases)
            if len(primer) < 2 or (primer[-1] != base or primer[-2] != base):
                primer += base
        gc_cont = float(sum([1 if x == 'G' or x == 'C' else 0 for x in primer])) / float(len(primer))
        if gc_cont > 0.4 and gc_cont < 0.6:
            return primer
        else:
            primer = ""

# Function to check if the primer meets the Tm requirement
def is_valid_tm(primer, target_tm=58, tolerance=2):
    primer_tm = mt.Tm_NN(primer)
    return target_tm - tolerance <= primer_tm <= target_tm + tolerance

# Generate 40 primers with minimum Levenshtein distance of 3
primers = []
while len(primers) < 40:
    new_primer = generate_primer()
    if is_valid_tm(new_primer):
        if all(distance(new_primer, existing) > 4 for existing in primers):
            primers.append(new_primer)

# Print the generated primers
for i, primer in enumerate(primers, 1):
    print(f"Primer{i},{primer}")

print(mt.Tm_NN("GATCCGAGCGGTGGCAGGAC"))

Primer1,TGTTAATTGGCGCCGCACC
Primer2,GCCACGCACGCAGGTTAAT
Primer3,TGCGCTCTACACGTGCTCA
Primer4,TAACCGGCGCATCCAACCT
Primer5,ACAGTGACGGCGAGTTGCA
Primer6,TGCTAGCAACTCGGCGTGT
Primer7,CGCGCTTGAAGTTGCTGCT
Primer8,ACAGCAACCTGCCGACACT
Primer9,GCGCATCCACGGCTTCATT
Primer10,AGTGGTTGCGCGCGTTATG
Primer11,ACGCAAGGCGGAGTTCAGA
Primer12,TGAACGTGCCGCAGTTCGA
Primer13,TGGCACCATTGGCGTCCAA
Primer14,TTGGTGTGAACGTGGCGCT
Primer15,ATGCCGCATTCAAGACGCC
Primer16,TCCTGCGGCGGTTGTAGTT
Primer17,ATCCATGCGGTACCGGCAA
Primer18,CGACATGCCGTTCCGCATT
Primer19,TCTCCGCAAGGACGCCATT
Primer20,ACCGCACCGTGTTATGCGA
Primer21,AGACCACGCAGGTTCGCAA
Primer22,TGGCACGCGTTGTTCACGT
Primer23,ATGCAGTGCAGCCGTTCGT
Primer24,AATCGCCGACGTCCTCGTT
Primer25,GCAACGCGACTGCGTTGTT
Primer26,AACGATGGCTGGCCGAACT
Primer27,TTCCGACCGCTGCAAGGTT
Primer28,AGTTGCACTCCGCTACGCT
Primer29,TCCGCGCATCATTGTGCTG
Primer30,ATGCGCGCAACCTGGAACA
Primer31,ATTCGCCGTAGCAACGCCA
Primer32,ATTGCAAGCCGCGACACTG
Primer33,ACCAATCGGTTCGTCGCGA
Primer34,AAGTGTGAGCGAGGCGACA
Primer35,AGTGCAGTCGGCTT

In [45]:
# analyze at https://www.idtdna.com/calc/analyzer/home/batch
target_set_pairs = {
    "AML"  : ["TGTTAATTGGCGCCGCACC","GCCACGCACGCAGGTTAAT"],    
    "bmp_library": ["TGCGCTCTACACGTGCTCA","TAACCGGCGCATCCAACCT"],    
    "coad_list": ["ACAGTGACGGCGAGTTGCA","TGCTAGCAACTCGGCGTGT"],    
    "embryo_factors": ["CGCGCTTGAAGTTGCTGCT","ACAGCAACCTGCCGACACT"],    
    "notch_library": ["GCGCATCCACGGCTTCATT","AGTGGTTGCGCGCGTTATG"],    
    "pdac_list": ["TGGCACCATTGGCGTCCAA","TTGGTGTGAACGTGGCGCT"],  
    "stem_cell": ["ATGCCGCATTCAAGACGCC","TCCTGCGGCGGTTGTAGTT"],    
    "wnt_library": ["ATCCATGCGGTACCGGCAA","CGACATGCCGTTCCGCATT"],
    "control" : ["TCTCCGCAAGGACGCCATT","ACCGCACCGTGTTATGCGA"]
}

human_mouse_pairs = {
    "human": ["TGACCAGCGGCTGCGTTAA","AAGCCACGAGTGGTTGCCA"],
    "mouse": ["AACATTCCGCGTACGGCCA","GCCGAATGCAACGTGCCTT"],
}

In [28]:
def find_sequence_occurrences(sequence, target):
    """
    Find forward and reverse occurrences of a nucleotide sequence in a string.

    Parameters:
    sequence (str): The nucleotide sequence to search within.
    target (str): The nucleotide sequence to search for.

    Returns:
    dict: A dictionary with keys 'forward' and 'reverse', each containing a list of start indices where the target occurs.
    """
    forward_indices = []
    reverse_target = target[::-1].translate(str.maketrans("ATCG", "TAGC"))  # Reverse complement
    reverse_indices = []

    # Find forward occurrences
    index = sequence.find(target)
    while index != -1:
        forward_indices.append(index)
        index = sequence.find(target, index + 1)

    # Find reverse occurrences
    index = sequence.find(reverse_target)
    while index != -1:
        reverse_indices.append(index)
        index = sequence.find(reverse_target, index + 1)

    return {
        'forward': forward_indices,
        'reverse': reverse_indices
    }

# Example usage
sequence = "ATCGTACGATCGTAGCTAGCGTACGATCGTAGCTAGCTA"
target = "CGTAC"
result = find_sequence_occurrences(sequence, target)
print("Forward occurrences:", result['forward'])
print("Reverse occurrences:", result['reverse'])


Forward occurrences: [2, 19]
Reverse occurrences: [3, 20]


In [40]:
print(len(find_sequence_occurrences("CGTCTCTCACCGAAGGTT","CGTCTC")['forward']))
print(len(find_sequence_occurrences("CGTATCTCACCGAAGGTT","CGTCTC")['forward']))


1
0


In [48]:
guide_file = open("2024_11_03_total_guide_table.txt")
header = guide_file.readline()
bsmbI_f_plus_g = "CGTCTCTCACCG"
bsmbI_r =        "GTTTAGAGACG"
output_guide_file = open("2024_11_02_total_guide_table_sequences.txt","w")
output_guide_fasta = open("2024_11_02_total_guide_table_sequences.fasta","w")
for line in guide_file:
    sp = line.strip().split("\t")
    target_set_primers = target_set_pairs[sp[2]]
    human_mouse_primers = human_mouse_pairs[sp[3]]
    
    output_seq = target_set_primers[0] + human_mouse_primers[0] + bsmbI_f_plus_g + sp[1] + bsmbI_r + human_mouse_primers[1] + target_set_primers[1]
    has_bsmbI = find_sequence_occurrences(sp[1],"CGTCTC")
    if len(has_bsmbI['forward']) == 0 and len(has_bsmbI['reverse']) == 0:
        output_sed_desc = "_".join(line.strip().split("\t")) + "_" + target_set_primers[0] + "_" + human_mouse_primers[0] + "_" + bsmbI_f_plus_g + "_" + sp[1] + "_" + bsmbI_f_plus_g + "_" + human_mouse_primers[1] + "_" + target_set_primers[1]
        output_guide_file.write(line.strip() + output_seq + "\n")
        output_guide_fasta.write(">" + output_sed_desc + "\n" + output_seq + "\n")
    else:
        print("dropping",output_seq)
    
output_guide_file.close()
output_guide_fasta.close()

dropping TCTCCGCAAGGACGCCATTAACATTCCGCGTACGGCCACGTCTCTCACCGAACCTCGTCTCATGTACGAAGTTTAGAGACGGCCGAATGCAACGTGCCTTACCGCACCGTGTTATGCGA
dropping TCTCCGCAAGGACGCCATTAACATTCCGCGTACGGCCACGTCTCTCACCGAACGTTATAGCTTCGTCTCTGTTTAGAGACGGCCGAATGCAACGTGCCTTACCGCACCGTGTTATGCGA
dropping TCTCCGCAAGGACGCCATTAACATTCCGCGTACGGCCACGTCTCTCACCGAATGAGCGTCTCTCGATCGCGTTTAGAGACGGCCGAATGCAACGTGCCTTACCGCACCGTGTTATGCGA
dropping TCTCCGCAAGGACGCCATTAACATTCCGCGTACGGCCACGTCTCTCACCGACCGTCTCTATTATACGGCAGTTTAGAGACGGCCGAATGCAACGTGCCTTACCGCACCGTGTTATGCGA
dropping TCTCCGCAAGGACGCCATTTGACCAGCGGCTGCGTTAACGTCTCTCACCGCCTTGGCGAGACGGAGGTACGTTTAGAGACGAAGCCACGAGTGGTTGCCAACCGCACCGTGTTATGCGA
dropping TCTCCGCAAGGACGCCATTTGACCAGCGGCTGCGTTAACGTCTCTCACCGCTGCACTGTGGAGACGCCCGGTTTAGAGACGAAGCCACGAGTGGTTGCCAACCGCACCGTGTTATGCGA


In [None]:
TODO:
weird sequences "   5 ACAGTGACGGCGAGTTGCAAACATTCCGCGTACGGCCACGTCTCTCACCGNACGTCTCTCACCGGCCGAATGCAACGTGCCTTTGCTAGCAACTCGGCGTGT"

control guides