# Cloning repository and installing dependencies

In [None]:
# Clone the DNA-Diffusion repository and run `uv sync` inside it to install the
# project's dependencies.
# NOTE: the `cd` below only affects this one shell invocation; the notebook's
# own working directory is switched in the next cell with %cd.
!git clone https://github.com/pinellolab/DNA-Diffusion.git && cd DNA-Diffusion && uv sync

In [2]:
%cd DNA-Diffusion

/content/DNA-Diffusion


# Generating sequences

In [3]:
# The key=value arguments override entries of the sampling configuration; the
# resolved configuration is echoed at the top of the output.
# Per the log, number_of_samples applies to each cell type separately;
# presumably samples are drawn in batches of sample_batch_size — TODO confirm.
!uv run sample_hf.py sampling.number_of_samples=10 sampling.sample_batch_size=5

model:
  _target_: src.dnadiffusion.models.pretrained_unet.PretrainedUNet.from_pretrained
  pretrained_model_name_or_path: ssenan/DNA-Diffusion
data:
  _target_: src.dnadiffusion.data.dataloader.get_dataset_for_sampling
  data_path: data/K562_hESCT0_HepG2_GM12878_12k_sequences_per_group.txt
  saved_data_path: data/encode_data.pkl
  load_saved_data: true
  debug: false
  cell_types: null
diffusion:
  _target_: src.dnadiffusion.models.diffusion.Diffusion
  timesteps: 50
  beta_start: 0.0001
  beta_end: 0.2
sampling:
  checkpoint_path: ssenan/DNA-Diffusion
  sample_batch_size: 5
  number_of_samples: 10
  guidance_scale: 1.0

config.json: 100% 153/153 [00:00<00:00, 1.26MB/s]
model.safetensors: 100% 378M/378M [00:01<00:00, 308MB/s]
Model sent to cuda
Found cell types: ['GM12878_ENCLB441ZZZ', 'HepG2_ENCLB029COU', 'K562_ENCLB843GMH', 'hESCT0_ENCLB449ZZZ']
Generating 10 samples for cell GM12878_ENCLB441ZZZ
100% 2/2 [00:09<00:00,  4.87s/it]
Generating 10 samples for cell HepG2_ENCLB029COU
100% 

In [4]:
import os
import subprocess

def display_sequences(output_dir="data/outputs"):
    """Print the contents of every generated sequence file in ``output_dir``.

    Files are visited in sorted name order. For each regular file whose name
    does not contain "gitkeep", a header with the cell type (the part of the
    file name before the first underscore) is printed, followed by the file's
    contents and a dashed separator line.

    Args:
        output_dir: Directory holding the generated sequence files.

    Returns:
        None. If ``output_dir`` is not an existing directory, an error message
        is printed and the function returns without raising.

    Raises:
        OSError: If a listed file cannot be opened or read.
    """
    if not os.path.isdir(output_dir):
        print(f"Error: Directory '{output_dir}' not found.")
        return

    print(f"Displaying sequences from: {output_dir}\n")

    for filename in sorted(os.listdir(output_dir)):
        filepath = os.path.join(output_dir, filename)

        # Skip sub-directories and the .gitkeep placeholder. Only the file name
        # is tested, so a "gitkeep" substring somewhere in output_dir itself
        # does not cause every file to be skipped.
        if not os.path.isfile(filepath) or "gitkeep" in filename:
            continue

        cell_type = filename.split("_")[0]
        print(f"--- Cell Type: {cell_type} ({filename}) ---")
        # Read the file directly instead of spawning `cat`; this works on any
        # platform and avoids a subprocess per file.
        with open(filepath) as handle:
            print(handle.read())
        print("-" * (len(cell_type) + 18) + "\n")

display_sequences()

Displaying sequences from: data/outputs

--- Cell Type: GM12878 (GM12878_ENCLB441ZZZ.txt) ---
TTGAGTTGTTTGAATAGTAACATGTACATACATAGTTTTGGTTCCCATTTGGAGTTAAATCTATGCATGACACATTCATTTCATTTGCAATACTGGTTAGTCATTAATTAGACTGAAATTTGATTGGTACTTCACTTTCTATCTCTAGTTATTGTGGTTTCTGGTGGTGTGTGCTGATCAAGGGGAGAATACAGTGCACT
AAGGACTAACTAAAACCGAAAACACCTAAGATTTTGGAATTTCCCTCCTTGCTTTCTATGTGCCTAGCTTGGAGACCATGAGGGTGTTGTTGAAACGAGAAATACATGAGGGTAGACTTTCACCTGTCGTTTCCATGAGTAAAATGATACTAAAGTAAAGTTTGAAGACAACGAGTTGTGTTACATGCTGTTTCTTTCTC
ACCACTAACAAAAACTGTTTTCAACGGAACCATTTTAGTTTCAGAACTTACCACATCTATGTGTGAGTCAGCAGAGTGAAAAGCAAAATAGGAAATATTAAGAAACACGCTCAAATAAATGTGATGTTATGGGGGAAATCCTGACTTTGTCACATGAAAAGGAAATGCAGCGTCATGGAAATAACTACCACCTACAAAGT
TACTTTCAAAAGATATATGAAAGGAAACATATTGTCTGTTTCATTTTCATTTTCTGAATTCTATCTAGCCAACCATAGTTGAACTCACAGTGTTAGCAGGGTTTAACCCCTGATTTGCCCCATTTTCGGTATAGCTCTGCACAAACTCATGACTGCCTGTCTTCTTTCAGGCAAAAGCTTTGTTCAAATTAGAGCAACTG
AGTCACTTGCGCCTAAGTGCACACAATGAGAGTAGGTGCTGCTGGAGGCTTCTCCCTGCAGCTGTTTGGGCTGCATGGGACCCGCTGACCCCACTCACCTAA

# Generating sequences with guidance scale 7.0

The guidance scale used during generation can be tuned with the `sampling.guidance_scale` override. For more details about classifier-free guidance, refer to the original paper: https://arxiv.org/abs/2207.12598.

In [5]:
!uv run sample_hf.py sampling.guidance_scale=7.0 sampling.number_of_samples=1 sampling.sample_batch_size=1

model:
  _target_: src.dnadiffusion.models.pretrained_unet.PretrainedUNet.from_pretrained
  pretrained_model_name_or_path: ssenan/DNA-Diffusion
data:
  _target_: src.dnadiffusion.data.dataloader.get_dataset_for_sampling
  data_path: data/K562_hESCT0_HepG2_GM12878_12k_sequences_per_group.txt
  saved_data_path: data/encode_data.pkl
  load_saved_data: true
  debug: false
  cell_types: null
diffusion:
  _target_: src.dnadiffusion.models.diffusion.Diffusion
  timesteps: 50
  beta_start: 0.0001
  beta_end: 0.2
sampling:
  checkpoint_path: ssenan/DNA-Diffusion
  sample_batch_size: 1
  number_of_samples: 1
  guidance_scale: 7.0

Model sent to cuda
Found cell types: ['GM12878_ENCLB441ZZZ', 'HepG2_ENCLB029COU', 'K562_ENCLB843GMH', 'hESCT0_ENCLB449ZZZ']
Generating 1 samples for cell GM12878_ENCLB441ZZZ
100% 1/1 [00:02<00:00,  2.35s/it]
Generating 1 samples for cell HepG2_ENCLB029COU
100% 1/1 [00:01<00:00,  1.86s/it]
Generating 1 samples for cell K562_ENCLB843GMH
100% 1/1 [00:01<00:00,  1.87s/it]


In [6]:
display_sequences()

Displaying sequences from: data/outputs

--- Cell Type: GM12878 (GM12878_ENCLB441ZZZ.txt) ---
ACTCTCCAGCGTTGGGTTGGGGATGTCTGCAGATCTGGGTAATGTGCTCGACAGTAAGATTGAAACTGAAACTGAAACTAGAAAAGAGGAACTGAAACCAGCAGCACTGAGAAAACCCCAGACAGAACATTAGTTTCAGTTGCGGTATGTAACTCATATGACTCTAAGCAGTTACACTTTTGGGCATGGAAGCCTAACTC
-------------------------

--- Cell Type: HepG2 (HepG2_ENCLB029COU.txt) ---
TAGATACACCTGATGTACAAATATTCCATGCACATGTTCACATTCCCACAGTTAATAATTGCGCAAGAGATCAAAGTTCAGGTACTATAAATACTCCCCCTTGCACAATACTACTATTAGTTTTACAGACACAATGTAAATATTGAGCAATACACTCTAGAGGTCTGGAGTTTTAGCAGGGAACTTTTCTTTTAGGGAGT
-----------------------

--- Cell Type: K562 (K562_ENCLB843GMH.txt) ---
CTCTTGATTTGCCTCCTTGTCTTCCCTCCTCGGCCCCCTCCCTCTGGTCTTCTTCTTATCTCTCCTGTGGACCGTTATCTCTCGGGCCTGCATGCACCTTATCTGCTCACTGGCAGGCCTCCCTTATCTCTGATCTTGCATGTGCCACTGCCTCACAATCTTATATTCTACGTCACCCACAACACTCGCCTCGTCAATGG
----------------------

--- Cell Type: hESCT0 (hESCT0_ENCLB449ZZZ.txt) ---
AAGGAGGCAGTCCTAAGGAAGGGAATTCAACAAGACATTAGTGTTCCATAGAGGAAGAAGATGGCAACATTCCTTCTCCC

In [7]:
# Remove the generated .txt files from data/outputs; presumably so that the
# display_sequences() calls below only show sequences from the following runs.
!rm data/outputs/*.txt

# Generating sequences for a specific cell type

The previous examples generate sequences for each cell type used to train the model. We can also generate sequences for a subset of the available cell types, which can be accomplished via CLI overrides. The desired cell types can be provided as a comma-separated string or as a list.

Generating just K562 sequences:

In [8]:
!uv run sample_hf.py data.cell_types=K562 sampling.number_of_samples=1 sampling.sample_batch_size=1
display_sequences()

model:
  _target_: src.dnadiffusion.models.pretrained_unet.PretrainedUNet.from_pretrained
  pretrained_model_name_or_path: ssenan/DNA-Diffusion
data:
  _target_: src.dnadiffusion.data.dataloader.get_dataset_for_sampling
  data_path: data/K562_hESCT0_HepG2_GM12878_12k_sequences_per_group.txt
  saved_data_path: data/encode_data.pkl
  load_saved_data: true
  debug: false
  cell_types: K562
diffusion:
  _target_: src.dnadiffusion.models.diffusion.Diffusion
  timesteps: 50
  beta_start: 0.0001
  beta_end: 0.2
sampling:
  checkpoint_path: ssenan/DNA-Diffusion
  sample_batch_size: 1
  number_of_samples: 1
  guidance_scale: 1.0

Matched 'K562' to 'K562_ENCLB843GMH'
Model sent to cuda
Found cell types: ['K562_ENCLB843GMH']
Generating 1 samples for cell K562_ENCLB843GMH
100% 1/1 [00:02<00:00,  2.50s/it]
Displaying sequences from: data/outputs

--- Cell Type: K562 (K562_ENCLB843GMH.txt) ---
ACTCAGGATCCTTTGTGAGTGTCTTTGGGGTCTGCTGTTATCTGCGGTTTCTGTGGCTAGATTCTCTCTTTTCAGAGGGTCAAGATGCGTCTGCTGATCAAGTCAGA

Generating both K562 and GM12878 sequences using a string CLI override:

In [9]:
!uv run sample_hf.py 'data.cell_types="K562,GM12878"' sampling.number_of_samples=1 sampling.sample_batch_size=1
display_sequences()

model:
  _target_: src.dnadiffusion.models.pretrained_unet.PretrainedUNet.from_pretrained
  pretrained_model_name_or_path: ssenan/DNA-Diffusion
data:
  _target_: src.dnadiffusion.data.dataloader.get_dataset_for_sampling
  data_path: data/K562_hESCT0_HepG2_GM12878_12k_sequences_per_group.txt
  saved_data_path: data/encode_data.pkl
  load_saved_data: true
  debug: false
  cell_types: K562,GM12878
diffusion:
  _target_: src.dnadiffusion.models.diffusion.Diffusion
  timesteps: 50
  beta_start: 0.0001
  beta_end: 0.2
sampling:
  checkpoint_path: ssenan/DNA-Diffusion
  sample_batch_size: 1
  number_of_samples: 1
  guidance_scale: 1.0

Matched 'K562' to 'K562_ENCLB843GMH'
Matched 'GM12878' to 'GM12878_ENCLB441ZZZ'
Model sent to cuda
Found cell types: ['K562_ENCLB843GMH', 'GM12878_ENCLB441ZZZ']
Generating 1 samples for cell K562_ENCLB843GMH
100% 1/1 [00:02<00:00,  2.32s/it]
Generating 1 samples for cell GM12878_ENCLB441ZZZ
100% 1/1 [00:01<00:00,  1.87s/it]
Displaying sequences from: data/outpu

Generating both K562 and GM12878 sequences using a list CLI override:

In [10]:
!uv run sample_hf.py 'data.cell_types=[K562,GM12878]' sampling.number_of_samples=1 sampling.sample_batch_size=1
display_sequences()

model:
  _target_: src.dnadiffusion.models.pretrained_unet.PretrainedUNet.from_pretrained
  pretrained_model_name_or_path: ssenan/DNA-Diffusion
data:
  _target_: src.dnadiffusion.data.dataloader.get_dataset_for_sampling
  data_path: data/K562_hESCT0_HepG2_GM12878_12k_sequences_per_group.txt
  saved_data_path: data/encode_data.pkl
  load_saved_data: true
  debug: false
  cell_types:
  - K562
  - GM12878
diffusion:
  _target_: src.dnadiffusion.models.diffusion.Diffusion
  timesteps: 50
  beta_start: 0.0001
  beta_end: 0.2
sampling:
  checkpoint_path: ssenan/DNA-Diffusion
  sample_batch_size: 1
  number_of_samples: 1
  guidance_scale: 1.0

Matched 'K562' to 'K562_ENCLB843GMH'
Matched 'GM12878' to 'GM12878_ENCLB441ZZZ'
Model sent to cuda
Found cell types: ['K562_ENCLB843GMH', 'GM12878_ENCLB441ZZZ']
Generating 1 samples for cell K562_ENCLB843GMH
100% 1/1 [00:02<00:00,  2.34s/it]
Generating 1 samples for cell GM12878_ENCLB441ZZZ
100% 1/1 [00:01<00:00,  1.87s/it]
Displaying sequences from: da