# scPrinter setup for Bos taurus
* Our goal? Being to characterize differential TF profiles between peak and late lactation

In [1]:
import torch
torch.cuda.is_available()
import cupy
a = cupy.zeros((1000, 1000)) # check gpu usage

In [2]:
a

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Comparing TF Profiles between Peak and Late Lactation

In [3]:
%load_ext autoreload
%autoreload 2
import scprinter as scp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
import pandas as pd
import numpy as np
import os
import pickle
import torch
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
from scanpy.plotting.palettes import zeileis_28
from tqdm.contrib.concurrent import *
from tqdm.auto import *
import anndata
import scanpy as sc
import json
import csv
import re
from sklearn.preprocessing import OneHotEncoder



In [4]:
work_dir="/net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man"
genome_dir="/net/talisker/home/benos/mae117/.local/share/genomes/ARS-UCD2.0"
frags_dir_controls="/net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments"
frags_dir_peaks="/net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/peak/fragments"
frags_dir_late="/net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/late/fragments"

In [5]:
controls_files=os.listdir(frags_dir_controls)
controls_files_frags=[i for i in controls_files if i.endswith(".fragments.tsv.gz")]
controls_files_frags=sorted([os.path.join(frags_dir_controls, i) for i in controls_files_frags])
controls_files_samples=["_".join(re.split("[/_\\.]", i)[11:13]) for i in controls_files_frags]

In [6]:
peak_files=os.listdir(frags_dir_peaks)
peak_files_frags=[i for i in peak_files if i.endswith(".fragments.tsv.gz")]
peak_files_frags=sorted([os.path.join(frags_dir_peaks, i) for i in peak_files_frags])
peak_files_samples=["_".join(re.split("[/_\\.]", i)[11:13]) for i in peak_files_frags]

In [7]:
late_files=os.listdir(frags_dir_late)
late_files_frags=[i for i in late_files if i.endswith(".fragments.tsv.gz")]
late_files_frags=sorted([os.path.join(frags_dir_late, i) for i in late_files_frags])
late_files_samples=["_".join(re.split("[/_\\.]", i)[11:13]) for i in late_files_frags]

## Create a custom ARS-UCD2.0 for scPrinter to work on!

### We need to build a [custom genome object](https://ruochiz.com/scprinter_doc/tutorials/custom_genome_tutorial.html) for cows RE scPrinter


* Read in the chromosome sizes of the cow genome as a dictionary

In [8]:
#os.mkdir(os.path.join(work_dir, "data/custom_genomes/ARS-UCD2.0"))

# sizes have to be read in like a dictionary
chrom_sizes = {}
#with open(os.path.join(genome_dir, "bosTau9.chrom.sizes")) as f:
with open(os.path.join(genome_dir, "ARS-UCD2.0.fa.sizes")) as f:
    for line in f:
       (key, val) = line.split()
       chrom_sizes[key] = int(val)

* Create a bias file for ARDS-UCD2

In [9]:
pretrain_Tn5_bias_model = scp.datasets.pretrained_Tn5_bias_model
pretrain_Tn5_bias_model

'/net/talisker/home/benos/mae117/.cache/scprinter/Tn5_NN_model_py_v2.pt'

In [10]:
scp.genome.predict_genome_tn5_bias(fa_file=str(os.path.join(genome_dir, "ARS-UCD2.0.fa")),
                            save_name=f"{work_dir}/data/bias.h5",
                            tn5_model=pretrain_Tn5_bias_model,
                            context_radius=50,
                            device="cuda",
                            batch_size=5000)

Predicting Tn5 bias for NKLS02001987.1: 100%|██████████| 1958/1958 [03:56<00:00,  8.30it/s] 


* Create the custom genome

In [11]:
def make_btau_splits(with_chr_prefix=True, include_mt=False):
    """
    Create five 'reasonable' ARS-UCD2.0 chromosome splits (train/valid/test) in the
    same structure as the human example. Partitions are disjoint and exhaustive.

    Args:
        with_chr_prefix (bool): if True, chromosomes are named 'chr1'...'chr29','chrX','chrY' (and 'chrMT' if included)
                                if False, they are '1'...'29','X','Y' ('MT' if included)
        include_mt (bool): include mitochondrial chromosome in the partitions.

    Returns:
        List[Dict[str, List[str]]]: five dicts with keys 'train', 'valid', 'test'
    """

    # ARS-UCD2.0 autosomes 1..29 plus sex chromosomes
    autosomes = [str(i) for i in range(1, 30)]
    sex = ["X", "Y"]
    mt = ["MT"] if include_mt else []
    all_chroms = autosomes + sex + mt

    # Helper to add/remove 'chr' prefix
    def lab(c):
        #return f"chr{c}" if with_chr_prefix else c
        return f"{c}" if with_chr_prefix else c

    # ----
    # Pre-chosen, balanced-ish folds (mix large+medium+small autosomes, spread X/Y across folds)
    # These are *chromosome labels without prefix*; we’ll map to prefixed labels at the end.
    # Each fold lists its 'test' and 'valid' sets; 'train' is derived as set difference of all_chroms.

    folds_spec = [
        # Fold 1
        {
            "test":  ["1", "3", "6", "12", "24"],
            "valid": ["8", "20"],
        },
        # Fold 2
        {
            "test":  ["2", "9", "16", "21", "27"],
            "valid": ["12", "17"],
        },
        # Fold 3
        {
            "test":  ["4", "11", "15", "19", "29", "Y"],
            "valid": ["7", "22"],
        },
        # Fold 4
        {
            "test":  ["5", "10", "14", "18", "20", "26"],
            "valid": ["6", "21"],
        },
        # Fold 5
        {
            "test":  ["7", "13", "17", "23", "25", "X"],
            "valid": ["10", "18"],
        },
    ]

    # Optionally distribute MT (if included), put it into validation of Fold 1
    if include_mt:
        folds_spec[0]["valid"] = folds_spec[0]["valid"] + ["MT"]

    # Build full folds, enforcing disjoint/exhaustive partition per fold
    folds = []
    all_set = set(all_chroms)
    for spec in folds_spec:
        test = set(spec["test"])
        valid = set(spec["valid"])
        # Sanity: ensure declared sets exist
        assert test.isdisjoint(valid), "test and valid overlap in spec"
        assert test.issubset(all_set) and valid.issubset(all_set), "spec includes unknown chromosomes"
        train = sorted(all_set - test - valid, key=lambda x: (x not in sex+mt, x))  # stable-ish sort

        fold = {
            "test":  sorted([lab(c) for c in test],  key=lambda x: (x not in {lab("X"), lab("Y"), lab("MT")}, x)),
            "valid": sorted([lab(c) for c in valid], key=lambda x: (x not in {lab("X"), lab("Y"), lab("MT")}, x)),
            "train": sorted([lab(c) for c in train], key=lambda x: (x not in {lab("X"), lab("Y"), lab("MT")}, x)),
        }
        folds.append(fold)

    return folds

btau_splits = make_btau_splits(with_chr_prefix=True, include_mt=False)
btau_splits

[{'test': ['1', '12', '24', '3', '6'],
  'valid': ['20', '8'],
  'train': ['X',
   'Y',
   '10',
   '11',
   '13',
   '14',
   '15',
   '16',
   '17',
   '18',
   '19',
   '2',
   '21',
   '22',
   '23',
   '25',
   '26',
   '27',
   '28',
   '29',
   '4',
   '5',
   '7',
   '9']},
 {'test': ['16', '2', '21', '27', '9'],
  'valid': ['12', '17'],
  'train': ['X',
   'Y',
   '1',
   '10',
   '11',
   '13',
   '14',
   '15',
   '18',
   '19',
   '20',
   '22',
   '23',
   '24',
   '25',
   '26',
   '28',
   '29',
   '3',
   '4',
   '5',
   '6',
   '7',
   '8']},
 {'test': ['Y', '11', '15', '19', '29', '4'],
  'valid': ['22', '7'],
  'train': ['X',
   '1',
   '10',
   '12',
   '13',
   '14',
   '16',
   '17',
   '18',
   '2',
   '20',
   '21',
   '23',
   '24',
   '25',
   '26',
   '27',
   '28',
   '3',
   '5',
   '6',
   '8',
   '9']},
 {'test': ['10', '14', '18', '20', '26', '5'],
  'valid': ['21', '6'],
  'train': ['X',
   'Y',
   '1',
   '11',
   '12',
   '13',
   '15',
   '16',
   '1

In [12]:
momoo = scp.genome.Genome(
    name=str(os.path.join(work_dir, "data/custom_genomes/ARS-UCD2.0")),
    chrom_sizes=chrom_sizes,
    gff_file=f"{genome_dir}/ARS-UCD2.0.annotation.gtf",
    fa_file=f"{genome_dir}/ARS-UCD2.0.fa",
    bias_file=f"{work_dir}/data/bias.h5",
    blacklist_file=None,
    bg=None,
    splits=btau_splits
)

In [15]:
import pickle
pickle.dump(momoo, open(f"{work_dir}/data/custom_genomes/ARS-UCD2.0", "wb"))

## Create scPrinter

### Create the printer object

In [8]:
objects = []
with (open(f"{work_dir}/data/custom_genomes/ARS-UCD2.0", "rb")) as openfile:
    while True:
        try:
            objects.append(pickle.load(openfile))
        except EOFError:
            break

momoo=objects[0]
momoo

<scprinter.genome.Genome at 0x7f12b402a750>

In [9]:
all_frags=controls_files_frags + peak_files_frags + late_files_frags
exp_frags=peak_files_frags + late_files_frags
sample_names = (
    #['control'] * len(controls_files_frags) +
    ['peak'] * len(peak_files_frags) +
    ['late'] * len(late_files_frags)
)

In [10]:
printer_exp = scp.pp.import_fragments(
    path_to_frags=exp_frags,
    barcodes=[None] * len(exp_frags),
    savename=os.path.join(work_dir, 'profiles/ars-ucd_exp_scprinter.h5ad'),
    genome=momoo,
    min_num_fragments=1000, min_tsse=7,
    sorted_by_barcode=False,
    low_memory=False,

    # Added
    auto_detect_shift=False,    # do NOT run the shape-sensitive auto-detect routine
    plus_shift=4,               # canonical Tn5 offset for ATAC (forward strand)
    minus_shift=-5,             # canonical Tn5 offset for ATAC (reverse strand),
)

Multiple fragments files detected, it is suggested to provide sample names to avoid barcode collision


Importing fragments:   0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]



start transferring insertions


In [11]:
printer_ctrl = scp.pp.import_fragments(
    path_to_frags=controls_files_frags,
    barcodes=[None] * len(controls_files_frags),
    savename=os.path.join(work_dir, 'profiles/ars-ucd_ctrl_scprinter.h5ad'),
    genome=momoo,
    min_num_fragments=1000, min_tsse=7,
    sorted_by_barcode=False,
    low_memory=False,
    
    # Added
    auto_detect_shift=False,    # do NOT run the shape-sensitive auto-detect routine
    plus_shift=4,               # canonical Tn5 offset for ATAC (forward strand
    minus_shift=-5,             # canonical Tn5 offset for ATAC (reverse strand),
)

Multiple fragments files detected, it is suggested to provide sample names to avoid barcode collision


Importing fragments:   0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]



start transferring insertions


### Call peaks from printer object and save

In [21]:
# Call peaks, this set of peaks are recommended to train seq2PRINT model
scp.pp.call_peaks(
    printer=printer_exp,
    frag_file=exp_frags,
    cell_grouping=[None], # here we call peaks on the cells that are included in the final analyses
    group_names=['all'],
    preset='seq2PRINT',
    overwrite=False
)

# Fetched the cleaned peaks, save, it will be used in the next step
cleaned_peaks = pd.DataFrame(printer_exp.uns["peak_calling"]['all_cleaned'][:])
cleaned_peaks.to_csv(f'{work_dir}/peaks/seq2print_cleaned_narrowPeak.bed',
                     sep='\t', header=False, index=False)

running macs2 with macs2 callpeak --nomodel -t /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/peak/fragments/SRR33155817.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/peak/fragments/SRR33155819.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/peak/fragments/SRR33155821.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/peak/fragments/SRR33155823.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/late/fragments/SRR33155816.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/late/fragments/SRR33155818.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/late/fragments/SRR33155820.fragments.tsv.gz /net/talisker/home/benos/mae117/Docu

Traceback (most recent call last):
  File "/net/talisker/home/benos/mae117/.conda/envs/tmm2/bin/macs2", line 653, in <module>
    main()
  File "/net/talisker/home/benos/mae117/.conda/envs/tmm2/bin/macs2", line 49, in main
    from MACS2.callpeak_cmd import run
  File "/net/talisker/home/benos/mae117/.conda/envs/tmm2/lib/python3.11/site-packages/MACS2/callpeak_cmd.py", line 23, in <module>
    from MACS2.OptValidator import opt_validate
  File "/net/talisker/home/benos/mae117/.conda/envs/tmm2/lib/python3.11/site-packages/MACS2/OptValidator.py", line 20, in <module>
    from MACS2.IO.Parser import BEDParser, ELANDResultParser, ELANDMultiParser, \
  File "MACS2/IO/Parser.pyx", line 25, in init MACS2.IO.Parser
  File "MACS2/IO/FixWidthTrack.pyx", line 27, in init MACS2.IO.FixWidthTrack
  File "MACS2/Pileup.pyx", line 19, in init MACS2.Pileup
  File "MACS2/IO/BedGraph.pyx", line 26, in init MACS2.IO.BedGraph
ImportError: /net/talisker/home/benos/mae117/.conda/envs/tmm2/lib/python3.11/site-

FileNotFoundError: [Errno 2] No such file or directory: '/net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/profiles/ars-ucd_exp_scprinter_supp/macs2/all_peaks.narrowPeak'

In [24]:
!macs2 callpeak --nomodel -t /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/peak/fragments/SRR33155817.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/peak/fragments/SRR33155819.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/peak/fragments/SRR33155821.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/peak/fragments/SRR33155823.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/late/fragments/SRR33155816.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/late/fragments/SRR33155818.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/late/fragments/SRR33155820.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/experiments/late/fragments/SRR33155822.fragments.tsv.gz --outdir /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/profiles/ars-ucd_exp_scprinter_supp/macs2 -n all -f BEDPE --nolambda --keep-dup all --call-summits --nomodel -B --SPMR --shift 75 --extsize 150 -p 0.01

Traceback (most recent call last):
  File "/net/talisker/home/benos/mae117/.conda/envs/tmm2/bin/macs2", line 653, in <module>
    main()
  File "/net/talisker/home/benos/mae117/.conda/envs/tmm2/bin/macs2", line 49, in main
    from MACS2.callpeak_cmd import run
  File "/net/talisker/home/benos/mae117/.conda/envs/tmm2/lib/python3.11/site-packages/MACS2/callpeak_cmd.py", line 23, in <module>
    from MACS2.OptValidator import opt_validate
  File "/net/talisker/home/benos/mae117/.conda/envs/tmm2/lib/python3.11/site-packages/MACS2/OptValidator.py", line 20, in <module>
    from MACS2.IO.Parser import BEDParser, ELANDResultParser, ELANDMultiParser, \
  File "MACS2/IO/Parser.pyx", line 25, in init MACS2.IO.Parser
  File "MACS2/IO/FixWidthTrack.pyx", line 27, in init MACS2.IO.FixWidthTrack
  File "MACS2/Pileup.pyx", line 19, in init MACS2.Pileup
  File "MACS2/IO/BedGraph.pyx", line 26, in init MACS2.IO.BedGraph
ImportError: /net/talisker/home/benos/mae117/.conda/envs/tmm2/lib/python3.11/site-

In [None]:
# Call peaks using chromvar preset, this set of peak are recommended to be use as cell x peak for scATAC-seq data, or analysis
scp.pp.call_peaks(
    printer=printer_ctrl,
    frag_file=controls_files_frags,
    cell_grouping=[None], # here we call peaks on the cells that are included in the final analyses
    group_names=['chromvar_all'],
    preset='chromvar',
    overwrite=False
)

# Fetched the cleaned peaks, save, it will be used in the next step
cleaned_peaks = pd.DataFrame(printer_ctrl.uns["peak_calling"]['chromvar_all_cleaned'][:])
cleaned_peaks.to_csv(f'{work_dir}/peaks/chromvar_regions.bed', sep='\t', header=False, index=False)

running macs2 with macs2 callpeak --nomodel -t /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930215.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930216.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930217.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930218.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930219.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930220.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930221.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragment

Traceback (most recent call last):
  File "/net/talisker/home/benos/mae117/.conda/envs/tmm2/bin/macs2", line 653, in <module>
    main()
  File "/net/talisker/home/benos/mae117/.conda/envs/tmm2/bin/macs2", line 49, in main
    from MACS2.callpeak_cmd import run
  File "/net/talisker/home/benos/mae117/.conda/envs/tmm2/lib/python3.11/site-packages/MACS2/callpeak_cmd.py", line 23, in <module>
    from MACS2.OptValidator import opt_validate
  File "/net/talisker/home/benos/mae117/.conda/envs/tmm2/lib/python3.11/site-packages/MACS2/OptValidator.py", line 20, in <module>
    from MACS2.IO.Parser import BEDParser, ELANDResultParser, ELANDMultiParser, \
  File "MACS2/IO/Parser.pyx", line 25, in init MACS2.IO.Parser
  File "MACS2/IO/FixWidthTrack.pyx", line 27, in init MACS2.IO.FixWidthTrack
  File "MACS2/Pileup.pyx", line 19, in init MACS2.Pileup
  File "MACS2/IO/BedGraph.pyx", line 26, in init MACS2.IO.BedGraph
ImportError: /net/talisker/home/benos/mae117/.conda/envs/tmm2/lib/python3.11/site-

FileNotFoundError: [Errno 2] No such file or directory: '/net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/profiles/ars-ucd_ctrl_scprinter_supp/macs2/chromvar_all_peaks.narrowPeak'

In [None]:
!macs2 callpeak --nomodel -t /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930215.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930216.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930217.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930218.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930219.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930220.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930221.fragments.tsv.gz /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/data/controls/fragments/SRR34930222.fragments.tsv.gz --outdir /net/talisker/home/benos/mae117/Documents/careers/opalia/the-milk-man/profiles/ars-ucd_ctrl_scprinter_supp/macs2 -n chromvar_all -f BEDPE --nolambda --keep-dup all --call-summits --nomodel -B --SPMR --shift 75 --extsize 150 -q 0.01

* compare the two sets of peaks with the different presets

In [None]:
print (pd.DataFrame(printer.uns["peak_calling"]['all_cleaned'][:]))

In [None]:
print (pd.DataFrame(printer.uns["peak_calling"]['chromvar_all_cleaned'][:]))

### Compare profiles