In [1]:
!wget https://ftp.ensembl.org/pub/release-114/fasta/mus_musculus/cdna/Mus_musculus.GRCm39.cdna.all.fa.gz
!gunzip Mus_musculus.GRCm39.cdna.all.fa.gz
!wget https://ftp.ensembl.org/pub/release-114/fasta/mus_musculus/ncrna/Mus_musculus.GRCm39.ncrna.fa.gz
!gunzip Mus_musculus.GRCm39.ncrna.fa.gz

--2025-05-25 15:15:29--  https://ftp.ensembl.org/pub/release-114/fasta/mus_musculus/cdna/Mus_musculus.GRCm39.cdna.all.fa.gz
Resolving ftp.ensembl.org (ftp.ensembl.org)... 193.62.193.169
Connecting to ftp.ensembl.org (ftp.ensembl.org)|193.62.193.169|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 50893631 (49M) [application/x-gzip]
Saving to: ‘Mus_musculus.GRCm39.cdna.all.fa.gz’


2025-05-25 15:15:40 (4.51 MB/s) - ‘Mus_musculus.GRCm39.cdna.all.fa.gz’ saved [50893631/50893631]

--2025-05-25 15:15:41--  https://ftp.ensembl.org/pub/release-114/fasta/mus_musculus/ncrna/Mus_musculus.GRCm39.ncrna.fa.gz
Resolving ftp.ensembl.org (ftp.ensembl.org)... 193.62.193.169
Connecting to ftp.ensembl.org (ftp.ensembl.org)|193.62.193.169|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28414607 (27M) [application/x-gzip]
Saving to: ‘Mus_musculus.GRCm39.ncrna.fa.gz’


2025-05-25 15:15:49 (3.34 MB/s) - ‘Mus_musculus.GRCm39.ncrna.fa.gz’ saved [28414607/28414607]

In [2]:
# Import libraries
from Bio import SeqIO
import pandas as pd
from baitshop import probe_design as p_d

In [3]:
# Load the three inputs of the design run:
#   codebook  - table with one row per target gene (name, transcript id and
#               one 0/1 column per readout)
#   sequences - {record id: sequence string} for every record of the cDNA FASTA
#   readouts  - {record id: sequence string} for every readout sequence
# If a FASTA contains a repeated record id, the last record wins.
codebook = pd.read_csv('data/codebook_standardized.csv')

fasta_file = 'Mus_musculus.GRCm39.cdna.all.fa'
sequences = dict(
    (record.id, str(record.seq))
    for record in SeqIO.parse(fasta_file, "fasta")
)

readout_fasta = 'data/readouts.fasta'
readouts = dict(
    (record.id, str(record.seq))
    for record in SeqIO.parse(readout_fasta, "fasta")
)

In [4]:
# Sanity check: display the readout id -> sequence mapping loaded from
# data/readouts.fasta.
readouts

{'RS0015': 'ATCCTCCTTCAATACATCCC',
 'RS0083': 'ACACTACCACCATTTCCTAT',
 'RS0095': 'ACTCCACTACTACTCACTCT',
 'RS0109': 'ACCCTCTAACTTCCATCACA'}

In [5]:
# Sanity check: display the codebook (gene name, transcript id and the 0/1
# readout columns).
# NOTE(review): the 'Unnamed: 0' column looks like a saved index from the CSV;
# confirm the probe_design functions do not rely on it before dropping it.
codebook

Unnamed: 0,name,id,RS0015,RS0083,RS0095,RS0109
0,Rorb,ENSMUST00000112832.8,1,1,1,0
1,Cux2,ENSMUST00000086317.12,1,1,0,1
2,Rbp4,ENSMUST00000112335.4,1,0,1,1


In [6]:
# Extract the tRNA / rRNA (including mitochondrial) records from the ncRNA
# FASTA into their own file; it is used further down to screen probes for
# homology to these RNAs.
p_d.filter_trna_rrna(
    input_fasta="Mus_musculus.GRCm39.ncrna.fa",
    output_fasta="tRNA_rRNA.fasta",
)

Filtered tRNA, rRNA, mt-tRNA, and mt-rRNA sequences saved to tRNA_rRNA.fasta


In [7]:
# Candidate generation followed by a chain of filters. Each filter reports
# "<gene>: kept/total" so the attrition per stage can be read from the output.

# 1. Candidate probes for every gene in the codebook, annotated with GC and Tm.
#    NOTE(review): overlap=29 is presumably the allowed overlap between
#    neighbouring candidate windows - confirm against the baitshop docs.
probes_dict = p_d.generate_probes_dict_by_gene(
    codebook,
    sequences,
    overlap=29,
)
probes_dict = p_d.calculate_gc_and_tm(probes_dict)

# 2. Drop probes containing homopolymer runs longer than 4 nt.
filtered_probes = p_d.filter_probes_by_homopolymer(
    probes_dict,
    max_homopolymer_length=4,
)

# 3. Keep probes inside the library's default GC / Tm window
#    (reported by the filter as 0.3 < GC < 0.7 and 61 < Tm < 81).
filtered_probes = p_d.filter_probes_by_gc_and_tm(filtered_probes)

# 4. Screen against the 15-mers extracted from the tRNA/rRNA FASTA.
trna_rrna_kmers = p_d.extract_15mers_from_trna_rrna("tRNA_rRNA.fasta")
filtered_probes = p_d.filter_probes_by_trna_rrna_homology(
    filtered_probes,
    trna_rrna_kmers,
)

# 5. Screen against per-gene 17-mers with a homology threshold of 0.5.
gene_to_17mers = p_d.extract_17mers_by_gene(sequences, codebook)
filtered_probes = p_d.filter_probes_by_homology_threshold(
    filtered_probes,
    gene_to_17mers,
    homology_threshold=0.5,
)

# 6. Remove self- / cross-complementary probes (complement length >= 15).
filtered_probes = p_d.filter_self_and_cross_complementary_probes(
    filtered_probes,
    min_complement_length=15,
)

# 7. Delta-G filter, run on 16 worker processes. The result gets its own name
#    (filtered_probes_dg) and is what the selection step consumes.
#    NOTE(review): min_deltaG=0 at temperature=81 presumably rejects probes
#    with stable secondary structure at that temperature - confirm units and
#    sign convention in baitshop.
filtered_probes_dg = p_d.filter_probes_by_deltaG_parallel(
    filtered_probes,
    min_deltaG=0,
    temperature=81,
    num_workers=16,
)

Rorb: 7453/9264 probes retained with homopolymeric runs of length <= 4
Cux2: 4562/5084 probes retained with homopolymeric runs of length <= 4
Rbp4: 835/908 probes retained with homopolymeric runs of length <= 4
Rorb: 6388/7453 probes retained with 0.3 < GC < 0.7 and 61 < Tm < 81
Cux2: 3703/4562 probes retained with 0.3 < GC < 0.7 and 61 < Tm < 81
Rbp4: 670/835 probes retained with 0.3 < GC < 0.7 and 61 < Tm < 81
Rorb: 6341/6388 probes retained after filtering by tRNA/rRNA homology
Cux2: 3703/3703 probes retained after filtering by tRNA/rRNA homology
Rbp4: 670/670 probes retained after filtering by tRNA/rRNA homology
Rorb: 6341/6341 probes retained after filtering by homology threshold 0.5
Cux2: 3703/3703 probes retained after filtering by homology threshold 0.5
Rbp4: 665/670 probes retained after filtering by homology threshold 0.5
Rorb: 6341/6341 probes retained after self/cross complementarity filtering
Cux2: 3703/3703 probes retained after self/cross complementarity filtering
Rbp4: 

In [8]:
# Greedy selection of 64 probes per gene from the delta-G filtered candidates.
# NOTE(review): target_gc / target_tm presumably define the optimum used for
# the per-probe 'score' - confirm against the baitshop docs.
selected_probes = p_d.select_probes_greedy(
    filtered_probes_dg,
    codebook,
    num_probes=64,
    target_gc=0.5,
    target_tm=81,
)

In [9]:
# Per-gene summary of the selection: probe count, mean score, and mean / max
# overlap. A gene with no selected probes reports 0 for every statistic.
for gene, probes in selected_probes.items():
    scores = [probe['score'] for probe in probes]
    overlaps = [probe['overlap'] for probe in probes]

    num_probes = len(probes)
    if num_probes > 0:
        avg_score = sum(scores) / num_probes
        avg_overlap = sum(overlaps) / num_probes
    else:
        # Avoid dividing by zero for genes that ended up with no probes.
        avg_score = 0
        avg_overlap = 0
    max_overlap = max(overlaps, default=0)

    print(f"Gene: {gene}, Number of Probes: {num_probes}, Average Score: {avg_score:.2f}, Average Overlap: {avg_overlap:.2f}, Max Overlap: {max_overlap}")

Gene: Rorb, Number of Probes: 64, Average Score: 6.40, Average Overlap: 0.00, Max Overlap: 0
Gene: Cux2, Number of Probes: 64, Average Score: 5.04, Average Overlap: 0.00, Max Overlap: 0
Gene: Rbp4, Number of Probes: 64, Average Score: 10.41, Average Overlap: 4.22, Max Overlap: 15


In [10]:
# Attach readout sequences to every selected probe according to the codebook,
# two readouts per probe.
# NOTE(review): this rebinds `selected_probes` to the function's own output, so
# re-running only this cell feeds already-annotated probes back in. Re-run the
# selection cell first if this cell has to be executed again.
selected_probes = p_d.assign_readouts_to_probes(
    selected_probes,
    codebook,
    readouts,
    num_readouts=2,
)

In [11]:
# Load the forward and reverse primers that are added to the exported probes.
# SeqIO.read expects exactly one record per file.
fprimer = SeqIO.read('data/Forward_primer.fasta', 'fasta')
rprimer = SeqIO.read('data/Reverse_primer.fasta', 'fasta')

In [12]:
# Write the final probes (with primers and readouts) to test_probes.fasta.
p_d.export_probes_to_fasta(
    selected_probes,
    readouts,
    fprimer,
    rprimer,
    'test_probes.fasta',
)

Probes with primers and readouts exported to test_probes.fasta
