In [None]:
try:
    import varseek as vk
except ImportError:
    print("varseek not found, installing...")
    !pip install -U -q varseek
# try:
#     import RLSRP_2025
# except ImportError:
#     print("RLSRP_2025 not found, installing...")
#     !pip install -q git+https://github.com/pachterlab/RLSRP_2025.git

In [None]:
import os
from varseek.utils import download_box_url

# Repository root = parent of the current working directory.
# With the notebook at RLSRP_2025/notebooks/0_data_download.ipynb (and the kernel started there), this resolves to RLSRP_2025.
RLSRP_2025_dir = os.path.abspath(os.pardir)

12:08:16 - INFO - Old pandas version detected. Patching DataFrame.map to DataFrame.applymap


In [None]:
# --- software ---
# kallisto is downloaded from Box if it is not present; Bandage must be downloaded manually (recommendations given below).
kallisto = "kallisto_base_feb25/kallisto/build/src/kallisto"
Bandage = "Bandage/Bandage_Ubuntu-x86-64_v0.9.0.AppImage"

# --- subsampling ---
# Each entry is an integer or "all". Too many unique sequences (like "all" for cosmic_cmc, which is ~5.3M)
# will produce an essentially empty picture.
number_of_VCRSs_list = [5_000, 25_000, 50_000]
subsampling_method = "first"  # "random" or "first"
random_seed = 42  # used only when randomly subsampling the fasta file
threads = 8  # used only when subsampling the fasta file

# --- reference parameters ---
# See `vk ref --list_downloadable_references` for the combinations available for download.
vk_ref_out_dir = os.path.join(RLSRP_2025_dir, "data", "vk_ref_out")  # existing vk ref output, or the destination for downloaded files
w = 47  # used only when downloading index/fasta files
k = 51  # used when downloading index/fasta files and when subsampling the fasta file
dlist_reference_source = "t2t"  # used only when downloading index/fasta files

# --- files that are downloaded (if not already present) or created ---
vcrs_fasta = os.path.join(vk_ref_out_dir, "vcrs_fasta.fa")
vcrs_index = os.path.join(vk_ref_out_dir, "vcrs_index.idx")
kallisto_inspect_txt_out_path = os.path.join(vk_ref_out_dir, "kallisto_inspect.txt")
gfa_out_path = os.path.join(vk_ref_out_dir, "vcrs_index.gfa")
jpg_out_path = os.path.join(vk_ref_out_dir, "vcrs_index.jpg")

In [None]:
# Download the reference files via varseek ref when they are not already on disk
# (alternatively, run scripts/run_vk_ref_fig1_3_4.py).
# Both outputs are checked: the subsampling step reads vcrs_fasta, so an existing
# index with a missing fasta must still trigger the download.
if not (os.path.exists(vcrs_index) and os.path.exists(vcrs_fasta)):
    import varseek as vk  # local import: `vk` may be unbound if varseek was only just installed

    vk.ref(
        variants="cosmic_cmc",
        sequences="cdna",
        w=w,
        k=k,
        dlist_reference_source=dlist_reference_source,
        download=True,
        index_out=vcrs_index,
        fasta_out=vcrs_fasta,
        t2g_out=os.path.join(vk_ref_out_dir, "vcrs_t2g.txt"),
    )

# Install kallisto - download the pre-compiled binary with the cell below. The binary was built with the following procedure:
- git clone https://github.com/pachterlab/kallisto.git
- cd kallisto
- *uncomment the line at* https://github.com/pachterlab/kallisto/blob/ba7da287c8f20400eb15e2ab8c51d0a4309fa4fc/src/Inspect.h#L145
- *in kallisto/ext/bifrost/src/IO.tcc, replace the line* `graph.write_sequence(std::to_string(labelA), seq.size(), seq, unitig.getData()->serialize(unitig));` *with*: `graph.write_sequence(std::to_string(labelA), seq.size(), seq, /*unitig.getData()->serialize(unitig)*/"");`
- cd ext/htslib
- autoreconf -i
- libtoolize --force
- aclocal
- autoheader
- automake --force-missing --add-missing
- cd ../..
- mkdir build && cd build
- cmake .. -DMAX_KMER_SIZE=64
- make

In [6]:
# Fetch the pre-compiled custom kallisto binary from Box unless it is already on disk.
if not os.path.exists(kallisto):
    CUSTOM_KALLISTO_COMPILED_BINARY_URL = "https://caltech.box.com/shared/static/4ildxlz5o2flrutwama0rpiqk8119fgb"
    # A bare file name has an empty dirname; fall back to the current directory in that case.
    output_folder = os.path.dirname(kallisto) or "."
    download_box_url(CUSTOM_KALLISTO_COMPILED_BINARY_URL, output_folder=output_folder, output_file_name="kallisto")

# Install Bandage - see https://rrwick.github.io/Bandage/

Installation is system-dependent; these are the commands I used on Linux:
- wget https://github.com/rrwick/Bandage/releases/download/v0.9.0/Bandage_Ubuntu-x86-64_v0.9.0_AppImage.zip
- unzip Bandage_Ubuntu-x86-64_v0.9.0_AppImage.zip -d Bandage
- cd Bandage
- chmod +x Bandage_Ubuntu-x86-64_v0.9.0.AppImage

In [6]:
# Bandage is not downloaded automatically; remind the user when the configured path is missing.
bandage_found = os.path.exists(Bandage)
if not bandage_found:
    print("Please download Bandage to proceed")

# Install seqtk
Run in a terminal: `conda install -c bioconda -y seqtk` (seqtk is only needed when `subsampling_method = "random"`).

# Run kallisto inspect and make the plot

In [None]:
if isinstance(number_of_VCRSs_list, (int, str)):
    number_of_VCRSs_list = [number_of_VCRSs_list]

for number_of_VCRSs in number_of_VCRSs_list:
    vcrs_fasta_subsampled = os.path.join(vk_ref_out_dir, f"vcrs_fasta_{number_of_VCRSs}_{subsampling_method}_VCRSs.fa")
    vcrs_index = os.path.join(vk_ref_out_dir, f"vcrs_index_{number_of_VCRSs}_{subsampling_method}_VCRSs.idx")
    kallisto_inspect_txt_out_path = os.path.join(vk_ref_out_dir, f"kallisto_inspect_{number_of_VCRSs}_{subsampling_method}_VCRSs.txt")
    gfa_out_path = os.path.join(vk_ref_out_dir, f"vcrs_index_{number_of_VCRSs}_{subsampling_method}_VCRSs.gfa")
    jpg_out_path = os.path.join(vk_ref_out_dir, f"vcrs_index_{number_of_VCRSs}_{subsampling_method}_VCRSs.jpg")
    if number_of_VCRSs != "all":
        # old method (took head instead of random): , 
        if subsampling_method == "random":
            !seqtk sample -s {random_seed} {vcrs_fasta} {number_of_VCRSs} > {vcrs_fasta_subsampled}
        elif subsampling_method == "first":
            number_of_lines = number_of_VCRSs * 2  # assumes each fasta sequence takes exactly 1 line
            !head -n {number_of_lines} {vcrs_fasta} > {vcrs_fasta_subsampled}
        !kb ref --workflow custom -t {threads} -k {k} -i {vcrs_index} {vcrs_fasta_subsampled}
    kallist_inspect_output = !{kallisto} inspect {vcrs_index} --gfa={gfa_out_path}
    !{Bandage} image {gfa_out_path} {jpg_out_path}

    with open(kallisto_inspect_txt_out_path, "w") as f:  # can't redirect to output because some of the output is printed to stdout
        f.write("\n".join(kallist_inspect_output) + "\n")