In [None]:
import pandas as pd

# Scratch benchmark: store the same three items per row in four different
# container types and compare the deep memory usage pandas reports for each.
n_rows = 5000000

df = pd.DataFrame(
    {
        "list_column": [["item1", "item2", "item3"]] * n_rows,
        "listlike_string_column": ['["item1", "item2", "item3"]'] * n_rows,
        "string_column": ["item1;item2;item3"] * n_rows,
        "tuple_column": [("item1", "item2", "item3")] * n_rows,
    }
)

# (label shown in the report, column measured)
for label, column in (
    ("List", "list_column"),
    ("List-like string", "listlike_string_column"),
    ("String", "string_column"),
    ("Tuple", "tuple_column"),
):
    # memory_usage returns bytes; two divisions by 1024 convert to MB
    print(f"{label} memory:", df[column].memory_usage(deep=True) / 1024 / 1024, "MB")

List memory: 381.4698486328125 MB
List-like string memory: 400.5433349609375 MB
String memory: 352.859619140625 MB
Tuple memory: 267.0289306640625 MB


<a href="https://colab.research.google.com/github/pachterlab/varseek-examples/blob/main/vk_count.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [vk count](https://github.com/pachterlab/varseek) demonstration
Perform variant screening on scRNA-seq data with vk count, using a [10x PBMC 1k dataset](https://www.10xgenomics.com/datasets/1-k-pbm-cs-from-a-healthy-donor-v-3-chemistry-3-standard-3-0-0) against the COSMIC Cancer Mutation Census mutation database as an example. Note: This requires a [COSMIC](https://cancer.sanger.ac.uk/cosmic) account.

Written by Joseph Rich.
___

### Install varseek, and import all packages

In [None]:
try:
    import varseek as vk
except ImportError:
    print("varseek not found, installing...")
    !pip install -U -q varseek

In [22]:
import os
import subprocess
import shutil

import varseek as vk

### Define important paths

In [None]:
# Root folder under which every tutorial input/output path below is built
data_dir = "data"

# --- vk count: output directory ---
vk_count_out_dir = os.path.join(data_dir, "varseek_count_out")

# --- vk ref: output directory and files (downloaded if not already present) ---
vk_ref_out_dir = os.path.join(data_dir, "varseek_ref_out")
vcrs_index = os.path.join(vk_ref_out_dir, "vcrs_index.idx")
vcrs_t2g = os.path.join(vk_ref_out_dir, "vcrs_t2g_filtered.txt")

# --- fastq data ---
# fastqs_dir is downloaded if not already present;
# fastqs_processed_dir is created with fastp if not already present
fastqs_dir = os.path.join(data_dir, "pbmc_1k_v3_fastqs")
fastqs_processed_dir = os.path.join(fastqs_dir, "filtered")
technology = "10xv3"
strand_bias_end = "3p"

# --- kb count against the reference genome (directory/files created if not already present) ---
account_for_strand_bias = False
qc_against_gene_matrix = False
kb_count_reference_genome_dir = os.path.join(data_dir, "kb_count_reference_genome")
# Both files below either already exist or will be created
reference_genome_index = os.path.join(kb_count_reference_genome_dir, "index.idx")
reference_genome_t2g = os.path.join(kb_count_reference_genome_dir, "t2g.txt")

# --- kb ref inputs ---
# Only needed when reference_genome_index / reference_genome_t2g do not exist yet:
# in that case the reference genome fasta and gtf are used to build them
reference_genome_fasta = "Homo_sapiens.GRCh37.dna.primary_assembly.fa"
reference_genome_gtf = "Homo_sapiens.GRCh37.87.gtf"

# --- general settings ---
threads = 2
k = 51
# COSMIC credentials are read from the environment (None when the variable is unset)
cosmic_email = os.environ.get("COSMIC_EMAIL")
cosmic_password = os.environ.get("COSMIC_PASSWORD")

### Download the vk ref index and t2g files if they do not already exist. See vk_ref.ipynb for more details.

In [None]:
# TODO: re-enable the vk.ref call below once the varseek-server situation is sorted,
# or once this tutorial is repeated with a new, public reference
# if not os.path.exists(vcrs_index) or not os.path.exists(vcrs_t2g):
    # vk.ref(variants="cosmic_cmc", sequences="cdna", w=47, k=51, dlist_reference_source="t2t", index_out=vcrs_index, t2g_out=vcrs_t2g, cosmic_email=cosmic_email, cosmic_password=cosmic_password, download=True)

# Until then, fetch each reference file from its Box URL unless it already exists locally.
# Each entry: (local path to check, Box URL, file name to save as inside vk_ref_out_dir)
box_downloads = (
    (vcrs_index, "https://caltech.box.com/shared/static/8693b78lh02fv8qh6wz6keng7cn2n91k.idx", "vcrs_index.idx"),
    (vcrs_t2g, "https://caltech.box.com/shared/static/0svv7xx0mobhfzpiz7f48bjljhs72kcy.txt", "vcrs_t2g_filtered.txt"),
)
for local_path, box_url, file_name in box_downloads:
    if os.path.exists(local_path):
        continue
    vk.utils.download_box_url(box_url, output_folder=vk_ref_out_dir, output_file_name=file_name)


12:48:06 - INFO - Successfully verified COSMIC credentials.
12:48:06 - INFO - Downloading reference files with variants=cosmic_cmc, sequences=cdna
12:48:54 - INFO - Downloaded files: {'fasta': None, 'index': 'data/varseek_ref_out_full/vcrs_index.idx', 't2g': 'data/varseek_ref_out_full/vcrs_t2g_filtered.txt'}


### Download the PBMC fastq dataset

In [5]:
# Download the PBMC 1k v3 fastq archive only when fastqs_dir is missing or empty.
# The shell chain creates data/, downloads the tar into it, extracts it, and then
# deletes the archive; each step runs only if the previous one succeeded (&&).
if not os.path.exists(fastqs_dir) or len(os.listdir(fastqs_dir)) == 0:
    !mkdir -p data && \
        cd data && \
        curl -O https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_v3/pbmc_1k_v3_fastqs.tar && \
        tar -xvf pbmc_1k_v3_fastqs.tar && \
        rm pbmc_1k_v3_fastqs.tar

### (Recommended): Process the fastq data - as an example, we will use [fastp](https://github.com/OpenGene/fastp)
Note: fastp was not designed for single-cell data, and as such it requires careful thought to ensure correct processing. The procedure below works on 10xv3 fastq files, but may need to be modified for other datasets. The function vk.utils.perform_fastp_trimming_and_filtering (called internally by vk.count) handles these nuances. While vk count would handle this step, we call it outside of vk count to demonstrate the process for clarity.

In [6]:
if shutil.which("fastp"):
    if not os.path.exists(fastqs_processed_dir) or len(os.listdir(fastqs_processed_dir)) == 0:
        os.makedirs(vk_count_out_dir, exist_ok=True)
        os.makedirs(fastqs_processed_dir, exist_ok=True)
        for file_r1, file_r2 in [("pbmc_1k_v3_S1_L001_R1_001.fastq.gz", "pbmc_1k_v3_S1_L001_R2_001.fastq.gz"), ("pbmc_1k_v3_S1_L002_R1_001.fastq.gz", "pbmc_1k_v3_S1_L002_R2_001.fastq.gz")]:
            print(f"Processing {file_r1} and {file_r2} with fastp")
            file_r1_path = os.path.join(fastqs_dir, file_r1)
            file_r2_path = os.path.join(fastqs_dir, file_r2)

            file_r1_out_path = os.path.join(fastqs_processed_dir, file_r1)
            file_r2_out_path = os.path.join(fastqs_processed_dir, file_r2)

            file_r1_out_path_tmp = os.path.join(fastqs_processed_dir, f"tmp_{os.path.basename(file_r1)}")
            file_r2_out_path_tmp = os.path.join(fastqs_processed_dir, f"tmp_{os.path.basename(file_r2)}")

            # low quality base removal - done separately from edge trimming so that we don't trim bases off of barcode + UMI file
            print(f"Low quality base removal for {file_r1} and {file_r2}")
            !fastp -i {file_r1_path} -I {file_r2_path} -o {file_r1_out_path_tmp} -O {file_r2_out_path_tmp} --disable_adapter_trimming --qualified_quality_phred 15 --unqualified_percent_limit 40 --average_qual 15 --n_base_limit 10 --disable_length_filtering --dont_eval_duplication --disable_trim_poly_g -h {vk_count_out_dir}/fastp_report1.html -j {vk_count_out_dir}/fastp_report1.json

            # edge trimming
            print(f"Edge trimming for {file_r2}")
            !fastp -i {file_r2_out_path_tmp} -o {file_r2_out_path} --cut_front --cut_tail --cut_window_size 4 --cut_mean_quality 15 --disable_quality_filtering --disable_length_filtering --dont_eval_duplication --disable_trim_poly_g -h {vk_count_out_dir}/fastp_report2.html -j {vk_count_out_dir}/fastp_report2.json --failed_out {vk_count_out_dir}/fastp_failed_tmp.fq

            if os.path.getsize(f"{vk_count_out_dir}/fastp_failed_tmp.fq") > 0:
                print(f"Removing reads from {file_r1} removed solely from {file_r2} during trimming")
                vk.utils.ensure_read_agreement(file_r1_out_path_tmp, file_r2_out_path, f"{vk_count_out_dir}/fastp_failed_tmp.fq", r1_fastq_out_path=file_r1_out_path)
            else:
                os.rename(file_r1_out_path_tmp, file_r1_out_path)

            # break
            
            os.remove(f"{vk_count_out_dir}/fastp_failed_tmp.fq")
            os.remove(file_r2_out_path_tmp)
else:
    print("fastp is not installed. Skipping fastq pre-processing")
    fastqs_processed_dir = fastqs_dir

### (Recommended): Pseudoalign the FASTQ data to the reference genome - helps with adata processing in varseek clean
While vk count would handle this step by providing the reference_genome_index and reference_genome_t2g arguments, we call it outside of vk count to demonstrate the process for clarity. For best quality results, please try to use the reference genome in this step that corresponds to the reference genome assembly and release used in varseek ref.

In [None]:
kb_count_reference_genome_adata = os.path.join(kb_count_reference_genome_dir, "counts_unfiltered", "adata.h5ad")
if qc_against_gene_matrix and not os.path.exists(kb_count_reference_genome_adata):  # check if kb count was run
    os.makedirs(kb_count_reference_genome_dir, exist_ok=True)
    if not os.path.exists(reference_genome_index) or not os.path.exists(reference_genome_t2g):  # check if kb ref was run
        if not os.path.exists(reference_genome_fasta) or not os.path.exists(reference_genome_gtf):
            reference_genome_out_dir = os.path.dirname(reference_genome_fasta) if reference_genome_fasta else "."
            # using grch37, ensembl 93 to agree with COSMIC
            !gget ref -w dna,gtf -r 93 --out_dir {reference_genome_out_dir} -d human_grch37 && gunzip {reference_genome_fasta}.gz && gunzip {reference_genome_gtf}.gz
        reference_genome_f1 = os.path.join(kb_count_reference_genome_dir, "f1.fa")
        !kb ref -t {threads} -k {k} -i {reference_genome_index} -g {reference_genome_t2g} -f1 {reference_genome_f1} {reference_genome_fasta} {reference_genome_gtf}
    
    !kb count -t {threads} -i {reference_genome_index} -g {reference_genome_t2g} -x {technology} --h5ad --num -o {kb_count_reference_genome_dir} \
        {fastqs_processed_dir}/pbmc_1k_v3_S1_L001_R1_001.fastq.gz {fastqs_processed_dir}/pbmc_1k_v3_S1_L001_R2_001.fastq.gz \
        {fastqs_processed_dir}/pbmc_1k_v3_S1_L002_R1_001.fastq.gz {fastqs_processed_dir}/pbmc_1k_v3_S1_L002_R2_001.fastq.gz

[2025-03-01 23:04:33,438]    INFO [download] Downloading files for human (standard workflow) from https://github.com/pachterlab/kallisto-transcriptome-indices/releases/download/v1/human_index_standard.tar.xz to tmp/human_index_standard.tar.xz
100%|████████████████████████████████████████| 138M/138M [00:18<00:00, 7.79MB/s]
[2025-03-01 23:04:52,077]    INFO [download] Extracting files from tmp/human_index_standard.tar.xz
[2025-03-01 23:05:11,061]    INFO [count] Using index data/kb_count_reference_genome/index.idx to generate BUS file to data/kb_count_reference_genome from
[2025-03-01 23:05:11,062]    INFO [count]         data/pbmc_1k_v3_fastqs/filtered/pbmc_1k_v3_S1_L001_R1_001.fastq.gz
[2025-03-01 23:05:11,062]    INFO [count]         data/pbmc_1k_v3_fastqs/filtered/pbmc_1k_v3_S1_L001_R2_001.fastq.gz
[2025-03-01 23:05:11,062]    INFO [count]         data/pbmc_1k_v3_fastqs/filtered/pbmc_1k_v3_S1_L002_R1_001.fastq.gz
[2025-03-01 23:05:11,062]    INFO [count]         data/pbmc_1k_v3_fastq

### Run varseek count

This will run the following commands:
- `varseek fastqpp`: Preprocess the fastq files. By default, does nothing.
- `kb count` (variant reference): Perform variant screening on fastq data utilizing kb count's pseudoalignment algorithm. Variant data is stored in an AnnData object as a cell/sample x variant matrix.
- `kb count` ("normal" reference genome) (optional): Perform pseudoalignment of fastq data to the reference genome. Only performed if utilized in the subsequent "varseek clean" step. By default, will occur if the paths to the necessary files are not provided as input.
- `varseek clean`: Process the output of kb count. By default, this will threshold variant counts and ensure that there is agreement for each read between the gene of the variant to which the read aligned during the variant reference pseudoalignment and the gene to which the read aligned during the "normal" reference genome pseudoalignment.
- `varseek summarize`: Produces a text file summarizing some high-level insights from the variant screening process.

In [None]:
# Keyword arguments for vk.count, all taken from the configuration cell
vk_count_kwargs = dict(
    index=vcrs_index,
    t2g=vcrs_t2g,
    technology=technology,
    out=vk_count_out_dir,
    kb_count_reference_genome_dir=kb_count_reference_genome_dir,
    k=k,
    account_for_strand_bias=account_for_strand_bias,
    strand_bias_end=strand_bias_end,
    threads=threads,
    qc_against_gene_matrix=qc_against_gene_matrix,
)
# Optional arguments that would let vk.count perform the earlier manual steps itself:
#   quality_control_fastqs=True, cut_front=True, cut_tail=True  # equivalent to the fastp step above
#   reference_genome_index=reference_genome_index, reference_genome_t2g=reference_genome_t2g  # equivalent to the kb count step above

# The fastq directory is the single positional argument; the result is kept for the next cell
vk_count_output_dict = vk.count(fastqs_processed_dir, **vk_count_kwargs)

17:07:23 - INFO - Removing index files from fastq files list, as they are not utilized in kb count with technology 10xv3
17:07:23 - INFO - Setting length_required to 51 if fastqpp is run
17:07:23 - INFO - Skipping vk fastqpp because there was no use for it
17:07:23 - INFO - Running kb count with command: kb count -t 2 -k 51 -i data/varseek_ref_out_full/vcrs_index.idx -g data/varseek_ref_out_full/vcrs_t2g_filtered.txt -x 10xv3 --h5ad --parity single --strand unstranded -o data/varseek_count_out/kb_count_out_vcrs --overwrite --num data/pbmc_1k_v3_fastqs/filtered/pbmc_1k_v3_S1_L001_R1_001.fastq.gz data/pbmc_1k_v3_fastqs/filtered/pbmc_1k_v3_S1_L001_R2_001.fastq.gz data/pbmc_1k_v3_fastqs/filtered/pbmc_1k_v3_S1_L002_R1_001.fastq.gz data/pbmc_1k_v3_fastqs/filtered/pbmc_1k_v3_S1_L002_R2_001.fastq.gz
[2025-03-13 17:07:29,555]    INFO [count] Using index data/varseek_ref_out_full/vcrs_index.idx to generate BUS file to data/varseek_count_out/kb_count_out_vcrs from
[2025-03-13 17:07:29,555]    INF

In [33]:
# Point the reader at the two outputs of interest recorded in the vk.count result dict
for message, key in (
    ("Find summarized results in", 'vk_summarize_output_dir'),
    ("Find the processed adata object for further analysis in", 'adata_path'),
):
    print(f"{message} {vk_count_output_dict[key]}")

Find summarized results in None
Find the processed adata object for further analysis in /Users/joeyrich/Desktop/local/varseek-examples/data/varseek_count_out/adata_cleaned.h5ad


In [None]:
# vk.clean(
#   adata_vcrs = "data/varseek_count_out/kb_count_out_vcrs/counts_unfiltered/adata.h5ad",
#   technology = "10xv3",
#   min_counts = 2,
#   use_binary_matrix = False,
#   drop_empty_columns = False,
#   apply_single_end_mode_on_paired_end_data_correction = False,
#   split_reads_by_Ns_and_low_quality_bases = False,
#   apply_dlist_correction = False,
#   qc_against_gene_matrix = False,
#   filter_cells_by_min_counts = None,
#   filter_cells_by_min_genes = None,
#   filter_genes_by_min_cells = None,
#   filter_cells_by_max_mt_content = None,
#   doublet_detection = False,
#   remove_doublets = False,
#   cpm_normalization = False,
#   sum_rows = False,
#   vcrs_id_set_to_exclusively_keep = None,
#   vcrs_id_set_to_exclude = None,
#   transcript_set_to_exclusively_keep = None,
#   transcript_set_to_exclude = None,
#   gene_set_to_exclusively_keep = None,
#   gene_set_to_exclude = None,
#   k = 51,
#   mm = False,
#   union = False,
#   parity = "single",
#   multiplexed = None,
#   sort_fastqs = True,
#   adata_reference_genome = None,
#   fastqs = ['data/pbmc_1k_v3_fastqs/filtered/pbmc_1k_v3_S1_L001_R1_001.fastq.gz', 'data/pbmc_1k_v3_fastqs/filtered/pbmc_1k_v3_S1_L001_R2_001.fastq.gz', 'data/pbmc_1k_v3_fastqs/filtered/pbmc_1k_v3_S1_L002_R1_001.fastq.gz', 'data/pbmc_1k_v3_fastqs/filtered/pbmc_1k_v3_S1_L002_R2_001.fastq.gz'],
#   vk_ref_dir = None,
#   vcrs_index = "data/varseek_ref_out/vcrs_index.idx",
#   vcrs_t2g = "data/varseek_ref_out/vcrs_t2g_filtered.txt",
#   vcrs_fasta = None,
#   dlist_fasta = None,
#   variants_updated_csv = None,
#   kb_count_vcrs_dir = "data/varseek_count_out/kb_count_out_vcrs",
#   kb_count_reference_genome_dir = "data/kb_count_reference_genome",
#   variants_updated_csv_columns_to_merge = None,
#   seq_id_column = "seq_ID",
#   gene_id_column = "gene_id",
#   out = "data/varseek_count_out",
#   adata_vcrs_clean_out = None,
#   adata_reference_genome_clean_out = None,
#   vcf_out = None,
#   save_vcf = False,
#   dry_run = True,
#   overwrite = True,
#   threads = 2,
#   logging_level = None,
#   save_logs = False,
#   log_out_dir = None,
#   parity_kb_count = "single"
# )

In [None]:
# vk.summarize(
#   adata = "data/varseek_count_out/adata_cleaned.h5ad",
#   top_values = 10,
#   technology = "10xv3",
#   gene_name_column = None,
#   out = "data/varseek_count_out/vk_summarize",
#   dry_run = True,
#   overwrite = True,
#   logging_level = None,
#   save_logs = False,
#   log_out_dir = None
# )

varseek.varseek_summarize.summarize(
  adata = "data/varseek_count_out/adata_cleaned.h5ad",
  top_values = 10,
  technology = "10xv3",
  gene_name_column = None,
  out = "data/varseek_count_out/vk_summarize",
  dry_run = True,
  overwrite = True,
  logging_level = None,
  save_logs = False,
  log_out_dir = None
)
