# This code creates HCR probe pairs from the targets listed in the corresponding .csv file

If the Python environment is not available yet, create it with this command and select it as the notebook kernel:
```
conda create -n hcr_probe_generator -c bioconda biopython numpy=1.23.5 pandas=1.3.5 blast openpyxl
```

In [8]:
# import modules
from maker37cb_mod import maker
import os
import pandas as pd
from contextlib import redirect_stdout

Define the project name (the name of your input csv file, without the `.csv` extension) and make sure the file paths are correct:

In [9]:
# Project identifier; the input file is looked up as "<project_name>.csv".
project_name = "20250411_HCR_probes_Manuel"

# Folder holding the input csv file.
in_path = "/home/mstemmer/repos/HCR_probe_generator/targets/"
# Folder that receives one sub-folder of generated probes per project.
out_path = "/home/mstemmer/repos/HCR_probe_generator/generated_probes/"
# Folder holding the reference files named in the input csv.
reference_path = "/home/mstemmer/repos/HCR_probe_generator/references/"

Place your input .csv file into the targets folder (it must have the same name as the project). \
The reference file (placed in the references folder) should be an unpacked .fa file; its file name must also be specified in the input .csv file. \
--> See example_input.csv file for required headers/columns! 

Required headers (all other columns will be ignored): 
```
'short','gene_name','amplifier','reference','sequence'
```
'short': abbreviated species name (e.g. 'dr' for Danio rerio)

Set up the file structure \
Check that the input file is correct!

In [10]:
# Resolve the project's input file and create its output folder.
input_csv = f"{project_name}.csv"
# Join the path components explicitly so the result is correct whether or
# not out_path / in_path end with a path separator.
out_project = os.path.join(out_path, project_name)
os.makedirs(out_project, exist_ok=True)

# output folder to your generated HCR probes
print(f'You will find your HCR probes here: {out_project}')

# show input csv file with relevant columns
in_file = os.path.join(in_path, input_csv)
input_df = pd.read_csv(in_file)
required_columns = ['short', 'gene_name', 'amplifier', 'reference', 'sequence']
# Fail early with a readable message instead of pandas' generic KeyError.
missing_columns = [col for col in required_columns if col not in input_df.columns]
if missing_columns:
    raise KeyError(
        f"Input file {in_file} is missing required column(s): {missing_columns}"
    )
# All other columns are ignored from here on.
input_df = input_df[required_columns]
print()
print('All correct in the input csv file?')
# Last expression of the cell: the notebook renders the table for a visual check.
input_df

You will find your HCR probes here: /home/mstemmer/repos/HCR_probe_generator/generated_probes/20250411_HCR_probes_Manuel

All correct in the input csv file?


Unnamed: 0,short,gene_name,amplifier,reference,sequence
0,am,insm1,B1,AmexT_v47_dna.fa,GGGCGTGGTCCAGACCAAAGAGAAGCCTGGACTGGGGCAGCGACAG...
1,am,sp5,B2,AmexT_v47_dna.fa,CAGCAATCCCTTAACTTCCTTGTGCCTGCGCACCCGGAAGTGCACG...
2,am,satb2,B3,AmexT_v47_dna.fa,CTTGCCTGTCTCTCCTCCGCGTCTCTCGCCTCCCTCTCTTCTAGCC...
3,am,maf1,B4,AmexT_v47_dna.fa,CTGCAGTACTCTCGTATGCGAGAGGATTGCGTCTGCTGGTTACCGC...
4,am,rorc,B1,AmexT_v47_dna.fa,CTGAAGTGCAGAGAGTGAGATTCTGCAGATGACAGTCACTGACAGC...
5,am,pv3,B2,AmexT_v47_dna.fa,AGGGTGACAGGAAAGGAGGGGGGGGGGCATCACAGGTGGGCGAGGC...
6,am,slc17a8,B3,AmexT_v47_dna.fa,AGCAAAGTGAATGTTCTGGGAGTGGCAGTTAGAACAAGGGCGGTTT...
7,am,slc17a9,B4,AmexT_v47_dna.fa,GACTCATAGTGCTGGTTCAGGAGTGAACTCCTCGCCGCACTCAATT...
8,am,slc1a3,B1,AmexT_v47_dna.fa,AGGAAGTAAGAACGCACCCCTTTAGAAACCCGAGACAAACCAGGTA...
9,am,tulp1,B2,AmexT_v47_dna.fa,CCAGGAGTCTGTTTTCCAAAAGACAGAGCTTGTGCTGCGGTAATGA...


Run HCR probe generator over all rows in input_csv file. \
Code will try to generate 33 pairs for each target. If that can't be reached, the generator will re-run without trying to reach that maximum.

In [11]:
# Settings passed unchanged to maker() for every target. Their exact meaning
# is defined by maker37cb_mod.maker (not visible here) -- TODO confirm there
# before changing any of them.
pause = 12
polyAT = 5
polyCG = 5
choose = "n"
BlastProbes = "y"
dropout = "y"
show = "y"
report = "y"
numbr = 0

for index, row in input_df.iterrows():
    # One probe file and one log file per target, named <short>_<gene>_<amplifier>.
    target = f"{row['short']}_{row['gene_name']}_{row['amplifier']}"
    print(f"--> Working on {target}")
    outfile = os.path.join(out_project, f"{target}_probes.csv")
    log_file = os.path.join(out_project, f"{target}_log.txt")
    # os.path.join avoids a doubled separator when reference_path ends with "/".
    db = os.path.join(reference_path, row['reference'])

    # Everything maker() prints goes into the per-target log file.
    with open(log_file, 'w') as f, redirect_stdout(f):
        try:
            # First attempt: ask for the maximum number of probe pairs.
            maxprobe = "y"
            maker(row['gene_name'], row['sequence'], row['amplifier'], pause, choose,
                  polyAT, polyCG, BlastProbes, db, dropout, show, report, maxprobe,
                  numbr, outfile)
        except IndexError:
            # Presumably raised when the maximum cannot be reached; re-run
            # without trying to reach it and record the fallback in the log.
            print(f"--> maxprobe run failed for {target}; re-running with maxprobe='n'")
            maxprobe = "n"
            maker(row['gene_name'], row['sequence'], row['amplifier'], pause, choose,
                  polyAT, polyCG, BlastProbes, db, dropout, show, report, maxprobe,
                  numbr, outfile)


# Rename probe pools & fuse all probes into single .csv and .xlsx file for IDT order
print('Fusing probes...')
all_probes = os.path.join(out_project, f"{project_name}_all_probes")
# Empty frame with the expected columns; written as-is if there is nothing to fuse.
all_probes_df = pd.DataFrame({'Pool name': [], 'Sequence': []})
# Collect the per-target frames and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and copies the whole frame per call.
fused_frames = []

for index, row in input_df.iterrows():
    target = f"{row['short']}_{row['gene_name']}_{row['amplifier']}"
    print(f"--> Fusing {target}")

    probes = os.path.join(out_project, f"{target}_probes.csv")

    probes_df = pd.read_csv(probes)
    print(probes_df.shape)

    # The row index is appended so pools stay unique if a target repeats.
    new_pool_name = f"{target}_{index}"
    pool_names = probes_df['Pool name']
    if pool_names.empty:
        # Nothing to rename or fuse; previously this case raised a KeyError.
        print(f"    no probes found in {probes}, skipping")
        continue

    # The generated pool name is read from the second row, as before; fall
    # back to the first row when the file holds a single probe.
    old_pool_name = pool_names.iloc[1] if len(pool_names) > 1 else pool_names.iloc[0]

    # Assign the result back instead of calling replace(inplace=True) on a
    # column selection, which does not reliably update the parent frame.
    probes_df['Pool name'] = pool_names.replace(old_pool_name, new_pool_name)
    probes_df.to_csv(probes, index=False)

    fused_frames.append(probes_df)

if fused_frames:
    all_probes_df = pd.concat(fused_frames)
all_probes_df.to_csv(f'{all_probes}.csv', index=False)
all_probes_df.to_excel(f'{all_probes}.xlsx', index=False)


--> Working on am_insm1_B1
--> Working on am_sp5_B2
--> Working on am_satb2_B3
--> Working on am_maf1_B4
--> Working on am_rorc_B1
--> Working on am_pv3_B2
--> Working on am_slc17a8_B3
--> Working on am_slc17a9_B4
--> Working on am_slc1a3_B1
--> Working on am_tulp1_B2
--> Working on am_neurod4_B3
--> Working on dr_myo6b_B1
--> Working on ol_myo6b_B1
--> Working on ol_tekt3_B3
--> Working on dc_myo6b_B1
--> Working on dc_tekt3_B3
Fusing probes...
--> Fusing am_insm1_B1
(66, 2)
--> Fusing am_sp5_B2
(66, 2)
--> Fusing am_satb2_B3
(66, 2)
--> Fusing am_maf1_B4
(66, 2)
--> Fusing am_rorc_B1
(66, 2)
--> Fusing am_pv3_B2
(26, 2)
--> Fusing am_slc17a8_B3
(58, 2)
--> Fusing am_slc17a9_B4
(66, 2)
--> Fusing am_slc1a3_B1
(66, 2)
--> Fusing am_tulp1_B2
(64, 2)
--> Fusing am_neurod4_B3
(66, 2)
--> Fusing dr_myo6b_B1
(66, 2)
--> Fusing ol_myo6b_B1
(66, 2)
--> Fusing ol_tekt3_B3
(52, 2)
--> Fusing dc_myo6b_B1
(66, 2)
--> Fusing dc_tekt3_B3
(66, 2)
