In [None]:
import os
import subprocess
import shutil
import sys
import numpy as np
import pandas as pd
from varseek.utils import create_identity_t2g

# Parent of the current working directory (presumably the varseek repository root when the
# notebook is launched from its own folder - TODO confirm).
varseek_directory = os.path.dirname(os.path.abspath(""))

# Locate the environment that provides `kb` so the kallisto/bustools binaries bundled with kb_python can be reused.
kb_executable = shutil.which("kb")
if kb_executable is None:
    # Fail with an actionable message instead of the TypeError os.path.dirname(None) would raise.
    raise FileNotFoundError("`kb` was not found on PATH; install kb-python in the active environment before running this notebook")
conda_env_path = os.path.dirname(os.path.dirname(kb_executable))  # to get kb path (<env>/bin/kb -> <env>)
operating_system = "linux" if sys.platform.startswith("linux") else "darwin/m1"  # every non-Linux platform is treated as darwin/m1

# Derive the lib directory from the running interpreter instead of hard-coding python3.10.
# NOTE(review): assumes the notebook kernel runs in the same environment that provides `kb` - confirm.
python_lib_dir = f"python{sys.version_info.major}.{sys.version_info.minor}"
kb_python_bins = f"{conda_env_path}/lib/{python_lib_dir}/site-packages/kb_python/bins/{operating_system}"

kallisto = f"{kb_python_bins}/kallisto/kallisto"  # or kallisto_k64
bustools = f"{kb_python_bins}/bustools/bustools"

In [None]:
# Bulk test fixtures: reference FASTA, two FASTQ files, and the index / t2g / kb count outputs built from them.
kb_files_dir = f"{varseek_directory}/tests/kb_files"
ref_fa = f"{kb_files_dir}/ref_test.fa"
read1_fq = f"{kb_files_dir}/reads1_test.fq"
read2_fq = f"{kb_files_dir}/reads2_test.fq"
test_index = f"{kb_files_dir}/index_test.idx"
test_t2g = f"{kb_files_dir}/t2g_test.txt"
kb_count_out_test = f"{kb_files_dir}/test_kb_count_out_March2025_with_num"
kb_count_out_test2 = f"{kb_files_dir}/test_kb_count_out_March2025_without_num"

rnaseq_fastq_files_final = [read1_fq, read2_fq]
os.makedirs(kb_count_out_test, exist_ok=True)

# Build the kallisto index once; an existing index file is reused.
index_already_built = os.path.exists(test_index)
if not index_already_built:
    kb_ref_command = ["kb", "ref", "--workflow", "custom", "-t", "8", "-i", test_index, "--d-list", "None", ref_fa]
    subprocess.run(kb_ref_command, check=True)

# Create the t2g file from the reference FASTA once; an existing file is reused.
t2g_already_built = os.path.exists(test_t2g)
if not t2g_already_built:
    create_identity_t2g(ref_fa, test_t2g)

# Two kb count runs that differ only in whether --num is passed.
# Each run is skipped when its output directory already exists and is non-empty.
for count_out_dir, num_flags in ((kb_count_out_test, ["--num"]), (kb_count_out_test2, [])):
    if os.path.exists(count_out_dir) and len(os.listdir(count_out_dir)) != 0:
        continue
    kb_count_command = [
        "kb", "count", "-t", "2", "-i", test_index, "-g", test_t2g, "-x", "bulk",
        *num_flags,
        "--h5ad", "--parity", "single", "-o", count_out_dir,
        "--kallisto", kallisto, "--bustools", bustools,
        *rnaseq_fastq_files_final,
    ]
    subprocess.run(kb_count_command, check=True)

# Disabled paired-parity variant, kept for reference:
# kb_count_out_test = f"{varseek_directory}/tests/kb_files/test_kb_count_out_paired"
# if not os.path.exists(kb_count_out_test) or len(os.listdir(kb_count_out_test)) == 0:
#     kb_count_command = ["kb", "count", "-t", str(8), "-i", test_index, "-g", test_t2g, "-x", "bulk", "--num", "--h5ad", "--parity", "paired", "-o", kb_count_out_test] + rnaseq_fastq_files_final
#     subprocess.run(kb_count_command, check=True)

In [None]:
# Dump the BUS file of each kb count output directory to text with `bustools text`.
# The command is passed as an argument list (no shell) so paths containing spaces or
# shell metacharacters reach bustools unchanged.
for kb_count_out_dir in (kb_count_out_test, kb_count_out_test2):
    bus_text_file = os.path.join(kb_count_out_dir, "bus_text.txt")
    bus_file = os.path.join(kb_count_out_dir, "output.bus")
    bustools_text_command = [bustools, "text", "-o", bus_text_file, "-f", bus_file]
    subprocess.run(bustools_text_command, check=True)  # check=True raises if bustools exits non-zero

In [None]:
import importlib
import varseek.utils.varseek_clean_utils  # import the exact module that is reloaded on the next line
importlib.reload(varseek.utils.varseek_clean_utils)  # re-execute the module so the import below binds the reloaded make_bus_df
from varseek.utils.varseek_clean_utils import make_bus_df
# NOTE(review): later calls in this notebook pass `kb_count_out=` rather than `kallisto_out=`;
# confirm which keyword the current make_bus_df signature accepts.
bus_df = make_bus_df(kallisto_out = kb_count_out_test, fastq_file_list = rnaseq_fastq_files_final, t2g_file = test_t2g, mm = False, technology = "bulk", bustools = bustools)

In [None]:
# Add a second sample's FASTQ pair, giving four FASTQ files in total.
read1_sample2_fq = f"{varseek_directory}/tests/kb_files/reads1_sample2_test.fq"
read2_sample2_fq = f"{varseek_directory}/tests/kb_files/reads2_sample2_test.fq"
rnaseq_fastq_files_final = [read1_fq, read2_fq, read1_sample2_fq, read2_sample2_fq]

kb_count_out_test_4_total_fastqs = f"{varseek_directory}/tests/kb_files/test_kb_count_out_March2025_with_num_4_total_fastqs"

# Run kb count only when the output directory is missing or empty.
if not os.path.exists(kb_count_out_test_4_total_fastqs) or len(os.listdir(kb_count_out_test_4_total_fastqs)) == 0:
    kb_count_command = ["kb", "count", "-t", str(2), "-i", test_index, "-g", test_t2g, "-x", "bulk", "--num", "--h5ad", "--parity", "single", "-o", kb_count_out_test_4_total_fastqs, "--kallisto", kallisto, "--bustools", bustools] + rnaseq_fastq_files_final
    subprocess.run(kb_count_command, check=True)

# Dump the BUS file to text. The command is an argument list (no shell) so paths containing
# spaces or shell metacharacters reach bustools unchanged.
bus_text_file = os.path.join(kb_count_out_test_4_total_fastqs, "bus_text.txt")
bus_file = os.path.join(kb_count_out_test_4_total_fastqs, "output.bus")
bustools_text_command = [bustools, "text", "-o", bus_text_file, "-f", bus_file]
subprocess.run(bustools_text_command, check=True)

import importlib
import varseek.utils.varseek_clean_utils  # import the exact module that is reloaded on the next line
importlib.reload(varseek.utils.varseek_clean_utils)  # re-execute the module so the import below binds the reloaded make_bus_df
from varseek.utils.varseek_clean_utils import make_bus_df
# NOTE(review): later calls in this notebook pass `kb_count_out=` rather than `kallisto_out=`;
# confirm which keyword the current make_bus_df signature accepts.
bus_df = make_bus_df(kallisto_out = kb_count_out_test_4_total_fastqs, fastq_file_list = rnaseq_fastq_files_final, t2g_file = test_t2g, mm = False, technology = "bulk", bustools = bustools)

loading in transcripts
loading in barcodes
loading in fastq headers
loading in fastq headers
loading in fastq headers
loading in fastq headers
loading in ec matrix
loading in t2g df
running bustools text
loading in bus df
merging ec df into bus df
Apply the mapping function to create gene name columns
added counted in matrix column
saving bus df


Read in 12 BUS records


In [3]:
# Inputs for the paired-parity, four-FASTQ case: the first sample's pair plus a second sample's pair.
kb_files_dir = f"{varseek_directory}/tests/kb_files"
read1_sample2_fq = f"{kb_files_dir}/reads1_sample2_test.fq"
read2_sample2_fq = f"{kb_files_dir}/reads2_sample2_test.fq"
rnaseq_fastq_files_final = [read1_fq, read2_fq] + [read1_sample2_fq, read2_sample2_fq]

kb_count_out_test_4_total_fastqs_paired = f"{kb_files_dir}/test_kb_count_out_March2025_with_num_4_total_fastqs_paired2"

# The generation steps below are disabled, so this cell only defines paths.
# NOTE(review): presumably the output directory is expected to exist already from an earlier run - confirm.
# if not os.path.exists(kb_count_out_test_4_total_fastqs_paired) or len(os.listdir(kb_count_out_test_4_total_fastqs_paired)) == 0:
#     kb_count_command = ["kb", "count", "-t", str(2), "-i", test_index, "-g", test_t2g, "-x", "bulk", "--num", "--h5ad", "--parity", "paired", "-o", kb_count_out_test_4_total_fastqs_paired, "--kallisto", kallisto, "--bustools", bustools] + rnaseq_fastq_files_final
#     subprocess.run(kb_count_command, check=True)

# bus_text_file = os.path.join(kb_count_out_test_4_total_fastqs_paired, "bus_text.txt")
# bus_file = os.path.join(kb_count_out_test_4_total_fastqs_paired, "output.bus")
# bustools_text_command = f"{bustools} text -o {bus_text_file} -f {bus_file}"
# subprocess.run(bustools_text_command, check=True, shell=True)

# import importlib
# import varseek.utils.seq_utils
# importlib.reload(varseek.utils.varseek_clean_utils)
# from varseek.utils.varseek_clean_utils import make_bus_df

In [None]:
import importlib
import varseek.utils.varseek_clean_utils  # import the exact module that is reloaded on the next line
importlib.reload(varseek.utils.varseek_clean_utils)  # re-execute the module so the import below binds the reloaded make_bus_df
from varseek.utils.varseek_clean_utils import make_bus_df
# Call make_bus_df on the paired four-FASTQ kb count output with check_only=True.
# NOTE(review): the exact effect of check_only is defined in varseek_clean_utils, not here - confirm before relying on `df`.
df = make_bus_df(kb_count_out=kb_count_out_test_4_total_fastqs_paired, fastq_file_list=rnaseq_fastq_files_final, t2g_file=test_t2g, mm=False, technology="bulk", parity="paired", bustools=bustools, check_only=True)

File /Users/joeyrich/Desktop/local/varseek/tests/kb_files/reads1_test.fq does not match the expected bulk file naming convention of SAMPLE_PAIR.EXT where SAMPLE is sample name, PAIR is 1/2, and EXT is a fastq extension - or the Illumina file naming convention of SAMPLE_LANE_R[12]_001.fastq.gz, where SAMPLE is letters, numbers, underscores; LANE is numbers with optional leading 0s; pair is either R1 or R2; and it has .fq or .fastq extension (or .fq.gz or .fastq.gz).
File /Users/joeyrich/Desktop/local/varseek/tests/kb_files/reads2_test.fq does not match the expected bulk file naming convention of SAMPLE_PAIR.EXT where SAMPLE is sample name, PAIR is 1/2, and EXT is a fastq extension - or the Illumina file naming convention of SAMPLE_LANE_R[12]_001.fastq.gz, where SAMPLE is letters, numbers, underscores; LANE is numbers with optional leading 0s; pair is either R1 or R2; and it has .fq or .fastq extension (or .fq.gz or .fastq.gz).
File /Users/joeyrich/Desktop/local/varseek/tests/kb_files/re

Processing FASTQ headers: 3it [00:00, 2437.60it/s]
Processing FASTQ headers: 3it [00:00, 24151.46it/s]
Processing FASTQ headers: 3it [00:00, 7672.51it/s]
Processing FASTQ headers: 3it [00:00, 31223.11it/s]


loading in ec matrix
loading in t2g df
loading in bus df
Merging fastq header df and ec_df into bus df
Apply the mapping function to create gene name columns


100%|██████████| 4/4 [00:00<00:00, 7906.32it/s]


Taking set of gene_names


100%|██████████| 4/4 [00:00<00:00, 8062.09it/s]


Determining what counts in count matrix


100%|██████████| 4/4 [00:00<00:00, 4450.19it/s]

Saving bus df as parquet





In [5]:
import os
import subprocess
import shutil
import sys
import numpy as np
import pandas as pd
from time import sleep
import anndata as ad
from scipy.io import mmread
from varseek.utils import create_identity_t2g

# Parent of the current working directory (presumably the varseek repository root when the
# notebook is launched from its own folder - TODO confirm).
varseek_directory = os.path.dirname(os.path.abspath(""))

# Locate the environment that provides `kb` so the kallisto/bustools binaries bundled with kb_python can be reused.
kb_executable = shutil.which("kb")
if kb_executable is None:
    # Fail with an actionable message instead of the TypeError os.path.dirname(None) would raise.
    raise FileNotFoundError("`kb` was not found on PATH; install kb-python in the active environment before running this notebook")
conda_env_path = os.path.dirname(os.path.dirname(kb_executable))  # to get kb path (<env>/bin/kb -> <env>)
operating_system = "linux" if sys.platform.startswith("linux") else "darwin/m1"  # every non-Linux platform is treated as darwin/m1

# Derive the lib directory from the running interpreter instead of hard-coding python3.10.
# NOTE(review): assumes the notebook kernel runs in the same environment that provides `kb` - confirm.
python_lib_dir = f"python{sys.version_info.major}.{sys.version_info.minor}"
kb_python_bins = f"{conda_env_path}/lib/{python_lib_dir}/site-packages/kb_python/bins/{operating_system}"

kallisto = f"{kb_python_bins}/kallisto/kallisto"  # or kallisto_k64
bustools = f"{kb_python_bins}/bustools/bustools"

In [11]:
# Toggles for the optional kb count flags exercised by this single-cell test.
mm = False  # when True, --mm is appended to the kb count command
union = False  # when True, --union is appended to the kb count command

ref_fa = f"{varseek_directory}/tests/kb_files/single_cell_tests/ref_sc_test.fa"
read1_fq = f"{varseek_directory}/tests/kb_files/single_cell_tests/reads_R1.fq"
read2_fq = f"{varseek_directory}/tests/kb_files/single_cell_tests/reads_R2.fq"
test_index = f"{varseek_directory}/tests/kb_files/single_cell_tests/index_test.idx"
test_t2g = f"{varseek_directory}/tests/kb_files/single_cell_tests/t2g_test.txt"
kb_count_out_test = f"{varseek_directory}/tests/kb_files/single_cell_tests/test_kb_count_out_hamming1"

rnaseq_fastq_files_final = [read1_fq, read2_fq]

# Remove any previous index, t2g file, and kb count output so everything below is rebuilt.
# Pure-Python removal replaces the `!rm -rf $path` magics, which split paths containing spaces
# into several shell arguments. Like `rm -rf`, a missing path is not an error.
for stale_path in (test_index, test_t2g, kb_count_out_test):
    if os.path.isdir(stale_path) and not os.path.islink(stale_path):
        shutil.rmtree(stale_path)
    elif os.path.lexists(stale_path):
        os.remove(stale_path)

# After the cleanup above these guards are always true; they are kept so the cleanup
# can be disabled to reuse existing outputs.
if not os.path.exists(test_index):
    kb_ref_command = ["kb", "ref", "--workflow", "custom", "-t", str(2), "-i", test_index, "--d-list", "None", ref_fa]
    subprocess.run(kb_ref_command, check=True)

if not os.path.exists(test_t2g):
    create_identity_t2g(ref_fa, test_t2g)

if not os.path.exists(kb_count_out_test) or len(os.listdir(kb_count_out_test)) == 0:
    kb_count_command = ["kb", "count", "-t", str(2), "-i", test_index, "-g", test_t2g, "-x", "10XV3", "--num", "--h5ad", "-o", kb_count_out_test, "--kallisto", kallisto, "--bustools", bustools]
    if mm:
        kb_count_command.append("--mm")
    if union:
        kb_count_command.append("--union")
    kb_count_command += rnaseq_fastq_files_final
    subprocess.run(kb_count_command, check=True)

# Dump the BUS file to text. The command is an argument list (no shell) so paths containing
# spaces or shell metacharacters reach bustools unchanged.
bus_text_file = os.path.join(kb_count_out_test, "bus_text.txt")
bus_file = os.path.join(kb_count_out_test, "output.bus")
bustools_text_command = [bustools, "text", "-o", bus_text_file, "-f", bus_file]
subprocess.run(bustools_text_command, check=True)

# subprocess.run returns only after bustools exits, so the file can be read immediately
# (no sleep needed); printing it replaces the `!cat` magic.
with open(bus_text_file) as bus_text_handle:
    print(bus_text_handle.read(), end="")

# Load the unfiltered count matrix written by kb count (--h5ad) and show its contents and labels.
adata_path = f"{kb_count_out_test}/counts_unfiltered/adata.h5ad"
adata = ad.read_h5ad(adata_path)
print("Count matrix: ", adata.X.toarray())
print("adata.var: ", adata.var.index)
print("adata.obs: ", adata.obs.index)

# mtx_path = f"{kb_count_out_test}/counts_unfiltered/cells_x_genes.mtx"
# mtx = mmread(mtx_path)
# print(mtx.toarray())  # same results as adata.X.toarray() (i.e., no differences with union)

import importlib
import varseek.utils.varseek_clean_utils
importlib.reload(varseek.utils.varseek_clean_utils)  # re-execute the module so the import below binds the reloaded make_bus_df
from varseek.utils.varseek_clean_utils import make_bus_df
# bus_df = make_bus_df(kb_count_out = kb_count_out_test, fastq_file_list = rnaseq_fastq_files_final, t2g_file = test_t2g, mm = False, technology = "10XV3", bustools = bustools)
# read_to_ref_dict = dict(zip(bus_df['fastq_header'], bus_df['gene_names']))

[2025-03-26 16:28:43,354]    INFO [ref_custom] Indexing /Users/joeyrich/Desktop/local/varseek/tests/kb_files/single_cell_tests/ref_sc_test.fa to /Users/joeyrich/Desktop/local/varseek/tests/kb_files/single_cell_tests/index_test.idx
[2025-03-26 16:28:44,672]    INFO [ref_custom] Finished creating custom index
[2025-03-26 16:28:45,514]   DEBUG [main] Printing verbose output
[2025-03-26 16:28:47,740]   DEBUG [main] kallisto binary located at /Users/joeyrich/miniconda3/envs/varseek/lib/python3.10/site-packages/kb_python/bins/darwin/m1/kallisto/kallisto
[2025-03-26 16:28:47,740]   DEBUG [main] bustools binary located at /Users/joeyrich/miniconda3/envs/varseek/lib/python3.10/site-packages/kb_python/bins/darwin/m1/bustools/bustools
[2025-03-26 16:28:47,740]   DEBUG [main] Creating `/Users/joeyrich/Desktop/local/varseek/tests/kb_files/single_cell_tests/test_kb_count_out_hamming1/tmp` directory
[2025-03-26 16:28:47,740]   DEBUG [main] Namespace(list=False, command='count', tmp=None, keep_tmp=Fal

AAACCCAAGAAACACT	TTTTTTTTTTTT	0	1	0
AAACCCAAGAAACACT	TTTTTTTTTTTT	0	1	1
TATCAGGAGCTAAGTG	TTTTTTTTTTTT	0	1	2
TATCAGGAGCTAAGTG	TTTTTTTTTTTT	1	1	3
TATCAGGAGCTAAGTG	TTTTTTTTTTTA	1	1	4
TATCAGGAGCTAAGTG	TTTTTTTTTTCC	2	1	6
TATCAGGAGCTAAGTG	TTTTTTTTTAAA	2	1	7
GAACCCAAGAAACACT	TTTTTTTTAACA	0	1	8
Count matrix:  [[2. 0. 0. 0. 0.]
 [1. 2. 0. 0. 0.]]
adata.var:  Index(['vcrs1', 'vcrs2', 'vcrs3', 'vcrs4', 'vcrs5'], dtype='object', name='gene_id')
adata.obs:  Index(['AAACCCAAGAAACACT', 'TATCAGGAGCTAAGTG'], dtype='object', name='barcode')


In [None]:
# Ground truth: the VCRS(s) each read is expected to be assigned to, keyed by FASTQ header.
read_to_ref_dict_gt = {
    'read0_mapsto_vcrs1_R2': ['vcrs1'],  # count_matrix_data["AAACCCAAGAAACACT"]["vcrs1"] = 1
    'read1_mapsto_vcrs1_repeat_R2': ['vcrs1'],  # because it has duplicate UMI as read0, it doesn't count for count matrix
    'read2_mapsto_vcrs1_different_barcode_R2': ['vcrs1'],  # count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs1"] = 1
    'read3_mapsto_vcrs2_same_barcode_R2': ['vcrs2'],  # count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs2"] = 1
    'read4_mapsto_vcrs2_different_umi_R2': ['vcrs2'],  # count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs2"] = 2
    'read5_mapsto_vcrs1_and_vcrs2_union_R2': [],  # doesn't count for count matrix OR show up in bus file unless --union is used
    'read6_mapsto_vcrs3_and_vcrs4_multimap_R2': ['vcrs3', 'vcrs4', 'vcrs5'],  # doesn't count for count matrix unless --mm is used (but shows up in bus file regardless)
    'read7_mapsto_vcrs3_and_vcrs4_multimap_different_umi_R2': ['vcrs3', 'vcrs4', 'vcrs5'],  # doesn't count for count matrix unless --mm is used (but shows up in bus file regardless)
    'read8_mapsto_vcrs1_barcode1_but_hamming_distance2_R2': ['vcrs1'],  # count_matrix_data["AAACCCAAGAAACACT"]["vcrs1"] = 2
}
if union:
    read_to_ref_dict_gt['read5_mapsto_vcrs1_and_vcrs2_union_R2'] = ['vcrs1', 'vcrs2']

# Expected count matrix, keyed barcode -> VCRS -> count, for the default mm=False / union=False run.
count_matrix_data = {
    "AAACCCAAGAAACACT": {"vcrs1": 2, "vcrs2": 0, "vcrs3": 0, "vcrs4": 0, "vcrs5": 0},
    "TATCAGGAGCTAAGTG": {"vcrs1": 1, "vcrs2": 2, "vcrs3": 0, "vcrs4": 0, "vcrs5": 0},
}
if union and mm:  # notably, won't show up in count matrix unless mm is also used
    count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs1"] += (1/2)  # each unioned read adds (1/n), where n is the number of VCRSs to which the read maps
    count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs2"] += (1/2)  # each unioned read adds (1/n), where n is the number of VCRSs to which the read maps
if mm:
    count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs3"] += (1/3) * 2  # each multimapped read adds (1/n), where n is the number of VCRSs to which the read maps, and reads 6 and 7 are both multimappers hence the *2
    count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs4"] += (1/3) * 2  # each multimapped read adds (1/n), where n is the number of VCRSs to which the read maps, and reads 6 and 7 are both multimappers hence the *2
    count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs5"] += (1/3) * 2  # each multimapped read adds (1/n), where n is the number of VCRSs to which the read maps, and reads 6 and 7 are both multimappers hence the *2

df = pd.DataFrame(count_matrix_data)  # outer dict keys become columns: rows are VCRS IDs, columns are barcodes

# adata.X is laid out barcodes x VCRSs (rows follow adata.obs, columns follow adata.var), so the
# expectation is transposed and aligned by label before comparing. Any barcode or VCRS present
# in adata but absent from the expectation becomes NaN and fails the check.
observed_counts = pd.DataFrame(adata.X.toarray(), index=adata.obs.index, columns=adata.var.index)
expected_counts = df.T.reindex(index=observed_counts.index, columns=observed_counts.columns)
# np.allclose rather than exact equality because the mm/union expectations are fractional.
# NOTE(review): only the mm=False / union=False expectation has been checked against recorded output.
assert np.allclose(observed_counts.to_numpy(), expected_counts.to_numpy()), (
    f"count matrix mismatch\nobserved:\n{observed_counts}\nexpected:\n{expected_counts}"
)

# Disabled until read_to_ref_dict is built again (the make_bus_df call that produces it is commented out).
# assert read_to_ref_dict == read_to_ref_dict_gt