In [None]:
import os
import subprocess
import shutil
import sys
import numpy as np
import anndata as ad
import pandas as pd
from varseek.utils import create_identity_t2g

varseek_directory = os.path.dirname(os.path.abspath(""))

conda_env_path = os.path.dirname(os.path.dirname(shutil.which("kb")))  # to get kb path
operating_system = "linux" if sys.platform.startswith("linux") else "darwin/m1"

kallisto = f"{conda_env_path}/lib/python3.10/site-packages/kb_python/bins/{operating_system}/kallisto/kallisto"  # or kallisto_k64
bustools = f"{conda_env_path}/lib/python3.10/site-packages/kb_python/bins/{operating_system}/bustools/bustools"

In [None]:
ref_fa = f"{varseek_directory}/tests/kb_files/ref_test.fa"
read1_fq = f"{varseek_directory}/tests/kb_files/reads1_test.fq"
read2_fq = f"{varseek_directory}/tests/kb_files/reads2_test.fq"
test_index = f"{varseek_directory}/tests/kb_files/index_test.idx"
test_t2g = f"{varseek_directory}/tests/kb_files/t2g_test.txt"
kb_count_out_test = f"{varseek_directory}/tests/kb_files/test_kb_count_out_March2025_with_num"
kb_count_out_test2 = f"{varseek_directory}/tests/kb_files/test_kb_count_out_March2025_without_num"

rnaseq_fastq_files_final = [read1_fq, read2_fq]
os.makedirs(kb_count_out_test, exist_ok=True)

if not os.path.exists(test_index):
    kb_ref_command = ["kb", "ref", "--workflow", "custom", "-t", str(8), "-i", test_index, "--d-list", "None", ref_fa]
    subprocess.run(kb_ref_command, check=True)

if not os.path.exists(test_t2g):
    create_identity_t2g(ref_fa, test_t2g)

if not os.path.exists(kb_count_out_test) or len(os.listdir(kb_count_out_test)) == 0:
    kb_count_command = ["kb", "count", "-t", str(2), "-i", test_index, "-g", test_t2g, "-x", "bulk", "--num", "--h5ad", "--parity", "single", "-o", kb_count_out_test, "--kallisto", kallisto, "--bustools", bustools] + rnaseq_fastq_files_final
    subprocess.run(kb_count_command, check=True)

if not os.path.exists(kb_count_out_test2) or len(os.listdir(kb_count_out_test2)) == 0:
    kb_count_command = ["kb", "count", "-t", str(2), "-i", test_index, "-g", test_t2g, "-x", "bulk", "--h5ad", "--parity", "single", "-o", kb_count_out_test2, "--kallisto", kallisto, "--bustools", bustools] + rnaseq_fastq_files_final
    subprocess.run(kb_count_command, check=True)

# kb_count_out_test = f"{varseek_directory}/tests/kb_files/test_kb_count_out_paired"
# if not os.path.exists(kb_count_out_test) or len(os.listdir(kb_count_out_test)) == 0:
#     kb_count_command = ["kb", "count", "-t", str(8), "-i", test_index, "-g", test_t2g, "-x", "bulk", "--num", "--h5ad", "--parity", "paired", "-o", kb_count_out_test] + rnaseq_fastq_files_final
#     subprocess.run(kb_count_command, check=True)

In [None]:
bus_text_file = os.path.join(kb_count_out_test, "bus_text.txt")
bus_file = os.path.join(kb_count_out_test, "output.bus")
bustools_text_command = f"{bustools} text -o {bus_text_file} -f {bus_file}"
subprocess.run(bustools_text_command, check=True, shell=True)

bus_text_file = os.path.join(kb_count_out_test2, "bus_text.txt")
bus_file = os.path.join(kb_count_out_test2, "output.bus")
bustools_text_command = f"{bustools} text -o {bus_text_file} -f {bus_file}"
subprocess.run(bustools_text_command, check=True, shell=True)

In [None]:
import importlib
import varseek.utils.seq_utils
importlib.reload(varseek.utils.varseek_clean_utils)
from varseek.utils.varseek_clean_utils import make_bus_df
bus_df = make_bus_df(kallisto_out = kb_count_out_test, fastq_file_list = rnaseq_fastq_files_final, t2g_file = test_t2g, mm = False, technology = "bulk", bustools = bustools)

In [None]:
read1_sample2_fq = f"{varseek_directory}/tests/kb_files/reads1_sample2_test.fq"
read2_sample2_fq = f"{varseek_directory}/tests/kb_files/reads2_sample2_test.fq"
rnaseq_fastq_files_final = [read1_fq, read2_fq, read1_sample2_fq, read2_sample2_fq]

kb_count_out_test_4_total_fastqs = f"{varseek_directory}/tests/kb_files/test_kb_count_out_March2025_with_num_4_total_fastqs"

if not os.path.exists(kb_count_out_test_4_total_fastqs) or len(os.listdir(kb_count_out_test_4_total_fastqs)) == 0:
    kb_count_command = ["kb", "count", "-t", str(2), "-i", test_index, "-g", test_t2g, "-x", "bulk", "--num", "--h5ad", "--parity", "single", "-o", kb_count_out_test_4_total_fastqs, "--kallisto", kallisto, "--bustools", bustools] + rnaseq_fastq_files_final
    subprocess.run(kb_count_command, check=True)

bus_text_file = os.path.join(kb_count_out_test_4_total_fastqs, "bus_text.txt")
bus_file = os.path.join(kb_count_out_test_4_total_fastqs, "output.bus")
bustools_text_command = f"{bustools} text -o {bus_text_file} -f {bus_file}"
subprocess.run(bustools_text_command, check=True, shell=True)

import importlib
import varseek.utils.seq_utils
importlib.reload(varseek.utils.varseek_clean_utils)
from varseek.utils.varseek_clean_utils import make_bus_df
bus_df = make_bus_df(kallisto_out = kb_count_out_test_4_total_fastqs, fastq_file_list = rnaseq_fastq_files_final, t2g_file = test_t2g, mm = False, technology = "bulk", bustools = bustools)

loading in transcripts
loading in barcodes
loading in fastq headers
loading in fastq headers
loading in fastq headers
loading in fastq headers
loading in ec matrix
loading in t2g df
running bustools text
loading in bus df
merging ec df into bus df
Apply the mapping function to create gene name columns
added counted in matrix column
saving bus df


Read in 12 BUS records


In [3]:
read1_sample2_fq = f"{varseek_directory}/tests/kb_files/reads1_sample2_test.fq"
read2_sample2_fq = f"{varseek_directory}/tests/kb_files/reads2_sample2_test.fq"
rnaseq_fastq_files_final = [read1_fq, read2_fq, read1_sample2_fq, read2_sample2_fq]

kb_count_out_test_4_total_fastqs_paired = f"{varseek_directory}/tests/kb_files/test_kb_count_out_March2025_with_num_4_total_fastqs_paired2"

# if not os.path.exists(kb_count_out_test_4_total_fastqs_paired) or len(os.listdir(kb_count_out_test_4_total_fastqs_paired)) == 0:
#     kb_count_command = ["kb", "count", "-t", str(2), "-i", test_index, "-g", test_t2g, "-x", "bulk", "--num", "--h5ad", "--parity", "paired", "-o", kb_count_out_test_4_total_fastqs_paired, "--kallisto", kallisto, "--bustools", bustools] + rnaseq_fastq_files_final
#     subprocess.run(kb_count_command, check=True)

# bus_text_file = os.path.join(kb_count_out_test_4_total_fastqs_paired, "bus_text.txt")
# bus_file = os.path.join(kb_count_out_test_4_total_fastqs_paired, "output.bus")
# bustools_text_command = f"{bustools} text -o {bus_text_file} -f {bus_file}"
# subprocess.run(bustools_text_command, check=True, shell=True)

# import importlib
# import varseek.utils.seq_utils
# importlib.reload(varseek.utils.varseek_clean_utils)
# from varseek.utils.varseek_clean_utils import make_bus_df

In [None]:
import importlib
import varseek.utils.seq_utils
importlib.reload(varseek.utils.varseek_clean_utils)
from varseek.utils.varseek_clean_utils import make_bus_df
df = make_bus_df(kb_count_out=kb_count_out_test_4_total_fastqs_paired, fastq_file_list=rnaseq_fastq_files_final, t2g_file=test_t2g, mm=False, technology="bulk", parity="paired", bustools=bustools, fastq_sorting_check_only=True)

File /Users/joeyrich/Desktop/local/varseek/tests/kb_files/reads1_test.fq does not match the expected bulk file naming convention of SAMPLE_PAIR.EXT where SAMPLE is sample name, PAIR is 1/2, and EXT is a fastq extension - or the Illumina file naming convention of SAMPLE_LANE_R[12]_001.fastq.gz, where SAMPLE is letters, numbers, underscores; LANE is numbers with optional leading 0s; pair is either R1 or R2; and it has .fq or .fastq extension (or .fq.gz or .fastq.gz).
File /Users/joeyrich/Desktop/local/varseek/tests/kb_files/reads2_test.fq does not match the expected bulk file naming convention of SAMPLE_PAIR.EXT where SAMPLE is sample name, PAIR is 1/2, and EXT is a fastq extension - or the Illumina file naming convention of SAMPLE_LANE_R[12]_001.fastq.gz, where SAMPLE is letters, numbers, underscores; LANE is numbers with optional leading 0s; pair is either R1 or R2; and it has .fq or .fastq extension (or .fq.gz or .fastq.gz).
File /Users/joeyrich/Desktop/local/varseek/tests/kb_files/re

Processing FASTQ headers: 3it [00:00, 2437.60it/s]
Processing FASTQ headers: 3it [00:00, 24151.46it/s]
Processing FASTQ headers: 3it [00:00, 7672.51it/s]
Processing FASTQ headers: 3it [00:00, 31223.11it/s]


loading in ec matrix
loading in t2g df
loading in bus df
Merging fastq header df and ec_df into bus df
Apply the mapping function to create gene name columns


100%|██████████| 4/4 [00:00<00:00, 7906.32it/s]


Taking set of gene_names


100%|██████████| 4/4 [00:00<00:00, 8062.09it/s]


Determining what counts in count matrix


100%|██████████| 4/4 [00:00<00:00, 4450.19it/s]

Saving bus df as parquet





In [6]:
import os
import subprocess
import shutil
import sys
import numpy as np
import pandas as pd
from time import sleep
import anndata as ad
from scipy.io import mmread
from varseek.utils import create_identity_t2g

varseek_directory = os.path.dirname(os.path.abspath(""))

conda_env_path = os.path.dirname(os.path.dirname(shutil.which("kb")))  # to get kb path
operating_system = "linux" if sys.platform.startswith("linux") else "darwin/m1"

kallisto = f"{conda_env_path}/lib/python3.10/site-packages/kb_python/bins/{operating_system}/kallisto/kallisto"  # or kallisto_k64
bustools = f"{conda_env_path}/lib/python3.10/site-packages/kb_python/bins/{operating_system}/bustools/bustools"

In [None]:
mm = True
union = True

ref_fa = f"{varseek_directory}/tests/kb_files/single_cell_tests/ref_sc_test.fa"
read1_fq = f"{varseek_directory}/tests/kb_files/single_cell_tests/reads_R1.fq"
read2_fq = f"{varseek_directory}/tests/kb_files/single_cell_tests/reads_R2.fq"
test_index = f"{varseek_directory}/tests/kb_files/single_cell_tests/index_test.idx"
test_t2g = f"{varseek_directory}/tests/kb_files/single_cell_tests/t2g_test.txt"
kb_count_out_test = f"{varseek_directory}/tests/kb_files/single_cell_tests/test_kb_count_out_hamming1_mm_and_union"

rnaseq_fastq_files_final = [read1_fq, read2_fq]
os.makedirs(kb_count_out_test, exist_ok=True)

!rm -rf $test_index
!rm -rf $test_t2g
!rm -rf $kb_count_out_test

if not os.path.exists(test_index):
    kb_ref_command = ["kb", "ref", "--workflow", "custom", "-t", str(2), "-i", test_index, "--d-list", "None", ref_fa]
    subprocess.run(kb_ref_command, check=True)

if not os.path.exists(test_t2g):
    create_identity_t2g(ref_fa, test_t2g)

if not os.path.exists(kb_count_out_test) or len(os.listdir(kb_count_out_test)) == 0:
    kb_count_command = ["kb", "count", "-t", str(2), "-i", test_index, "-g", test_t2g, "-x", "10XV3", "--num", "--h5ad", "-o", kb_count_out_test, "--kallisto", kallisto, "--bustools", bustools]
    if mm:
        kb_count_command.append("--mm")
    if union:
        kb_count_command.append("--union")
    kb_count_command += rnaseq_fastq_files_final
    subprocess.run(kb_count_command, check=True)

bus_text_file = os.path.join(kb_count_out_test, "bus_text.txt")
bus_file = os.path.join(kb_count_out_test, "output.bus")
bustools_text_command = f"{bustools} text -o {bus_text_file} -f {bus_file}"
subprocess.run(bustools_text_command, check=True, shell=True)
sleep(1)
!cat $bus_text_file

adata_path = f"{kb_count_out_test}/counts_unfiltered/adata.h5ad"
adata = ad.read_h5ad(adata_path)
print("Count matrix: ", adata.X.toarray())
print("adata.var: ", adata.var.index)
print("adata.obs: ", adata.obs.index)

# mtx_path = f"{kb_count_out_test}/counts_unfiltered/cells_x_genes.mtx"
# mtx = mmread(mtx_path)
# print(mtx.toarray())  # same results as adata.X.toarray() (i.e., no differences with union)

import importlib
import varseek.utils.varseek_clean_utils
importlib.reload(varseek.utils.varseek_clean_utils)
from varseek.utils.varseek_clean_utils import make_bus_df
# bus_df = make_bus_df(kb_count_out = kb_count_out_test, fastq_file_list = rnaseq_fastq_files_final, t2g_file = test_t2g, mm = False, technology = "10XV3", bustools = bustools)
# read_to_ref_dict = dict(zip(bus_df['fastq_header'], bus_df['gene_names']))

In [None]:
read_to_ref_dict_gt = {
    'read0_mapsto_vcrs1_R2': ['vcrs1'],  # count_matrix_data["AAACCCAAGAAACACT"]["vcrs1"] = 1
    'read1_mapsto_vcrs1_same_barcode_and_umi_R2': ['vcrs1'],  # because it has duplicate UMI as read0, it doesn't count for count matrix
    'read2_mapsto_vcrs1_different_barcode_R2': ['vcrs1'],  # count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs1"] = 1
    'read3_mapsto_vcrs2_same_barcode_R2': ['vcrs2'],  # count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs2"] = 1
    'read4_mapsto_vcrs2_different_umi_R2': ['vcrs2'],  # count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs2"] = 2
    'read5_mapsto_vcrs1_and_vcrs2_union_R2': [],  # doesn't count for count matrix OR show up in bus file unless --union is used
    'read6_mapsto_vcrs3_and_vcrs4_and_vcrs5_multimap_R2': ['vcrs3', 'vcrs4', 'vcrs5'],  # doesn't count for count matrix unless --mm is used (but shows up in bus file regardless)
    'read7_mapsto_vcrs3_and_vcrs4_and_vcrs5_multimap_different_umi_R2': ['vcrs3', 'vcrs4', 'vcrs5'],  # doesn't count for count matrix unless --mm is used (but shows up in bus file regardless)
    'read8_mapsto_vcrs1_barcode1_but_hamming_distance2_R2': ['vcrs1'],  # count_matrix_data["AAACCCAAGAAACACT"]["vcrs1"] = 2
    'read9_mapsto_vcrs1_and_vcrs2_barcode1_same_umi_as_read10_R2': [],  # doesn't count for VCRS1 without union (won't show up in BUS file without union); VCRS2 doesn't count regardless because it has the same UMI as read10, but read10 doesn't map to VCRS2 
    'read10_mapsto_vcrs1_and_vcrs6_barcode1_same_umi_as_read9_R2': [],  # doesn't count due to same barcode and UMI as read9
}
if union:
    read_to_ref_dict_gt['read5_mapsto_vcrs1_and_vcrs2_union_R2'] = ['vcrs1', 'vcrs2']
    read_to_ref_dict_gt['read9_mapsto_vcrs1_and_vcrs2_barcode1_same_umi_as_read10_R2'] = ['vcrs1', 'vcrs2']
    read_to_ref_dict_gt['read10_mapsto_vcrs1_and_vcrs6_barcode1_same_umi_as_read9_R2'] = ['vcrs1', 'vcrs6']

count_matrix_data = {
    "AAACCCAAGAAACACT": {"vcrs1": 2, "vcrs2": 0, "vcrs3": 0, "vcrs4": 0, "vcrs5": 0, "vcrs6": 0},
    "TATCAGGAGCTAAGTG": {"vcrs1": 1, "vcrs2": 2, "vcrs3": 0, "vcrs4": 0, "vcrs5": 0, "vcrs6": 0},
}
if union and mm:  # notably, won't show up in count matrix unless mm is also used
    count_matrix_data["AAACCCAAGAAACACT"]["vcrs1"] += (1/1)  # each unioned read adds (1/n), where n is the number of VCRSs to which the read maps - this comes from read9/10
    count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs1"] += (1/2)  # each unioned read adds (1/n), where n is the number of VCRSs to which the read maps - this comes from read5
    count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs2"] += (1/2)  # each unioned read adds (1/n), where n is the number of VCRSs to which the read maps - this comes from read5
if mm:
    count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs3"] += (1/3) * 2  # each multimapped read adds (1/n), where n is the number of VCRSs to which the read maps, and reads 6 and 7 are both multimappers hence the *2
    count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs4"] += (1/3) * 2  # each multimapped read adds (1/n), where n is the number of VCRSs to which the read maps, and reads 6 and 7 are both multimappers hence the *2
    count_matrix_data["TATCAGGAGCTAAGTG"]["vcrs5"] += (1/3) * 2  # each multimapped read adds (1/n), where n is the number of VCRSs to which the read maps, and reads 6 and 7 are both multimappers hence the *2
# count_matrix_data_with_multimap = {
#     "AAACCCAAGAAACACT": {"vcrs1": 3, "vcrs2": 0, "vcrs3": 0, "vcrs4": 0, "vcrs5": 0, "vcrs6": 0},
#     "TATCAGGAGCTAAGTG": {"vcrs1": 1.5, "vcrs2": 2.5, "vcrs3": 0.67, "vcrs4": 0.67, "vcrs5": 0.67, "vcrs6": 0},
# }

df = pd.DataFrame(count_matrix_data)

# assert read_to_ref_dict == read_to_ref_dict_gt
# assert np.array_equal(adata.X.toarray(), np.array([[1., 1., 0., 0., 0., 0.]]))

In [9]:
# cdna
len("ACAAAATGGCGGCGGCGGCGGCGGCGGCTGGTGCTGCAGGGTCGGCAGCTCCCGCGGCAGCGGCCGGCGCCCCGGGATCTGGGGGCGCACCCTCAGGGTCGCAGGGGGTGCTGATCGGGGACAGGCTGTACTCCGGGGTGCTCATCACCTTGGAGAACTGCCTCCTGCCTGACGACAAGCTCCGTTTCACGCCGTCCATGTCGAGCGGCCTCGACACCGACACAGAGACCGACCTCCGCGTGGTGGGCTGCGAGCTCATCCAGGCGGCCGGTATCCTGCTCCGCCTGCCGCAGGTGGCCATGGCTACCGGGCAGGTGTTGTTCCAGCGGTTCTTTTATACCAAGTCCTTCGTGAAGCACTCCATGGAGCATGTGTCAATGGCCTGTGTCCACCTGGCTTCCAAGATAGAAGAGGCCCCAAGACGCATACGGGACGTCATCAATGTGTTTCACCGCCTTCGACAGCTGAGAGACAAAAAGAAGCCCGTGCCTCTACTACTGGATCAAGATTATGTTAATTTAAAGAACCAAATTATAAAGGCGGAAAGACGAGTTCTCAAAGAGTTGGGTTTCTGCGTCCATGTGAAGCATCCTCATAAGATAATCGTTATGTACCTTCAGGTGTTAGAGTGTGAGCGTAACCAACACCTGGTCCAGACCTCATGGGTAGCCTCTGAGGACCCCTTGTTGAAATGGGACAGTTGGCAGCGGCTCTGATGAGCCCGAGAAGAGGCCTGCCCTTGGGTGCGGAGTCTCCCTCCGCACGATGCTCCCACGCGTCCAACTTGCACCCAAGGGGCTTTTCCCTCTTCCAAGTGGACTCCTTCAAGGAAGCTGCAGCTCGGTCAGCAGAGAAGGGGCCTGCCGCCAGCGCCCTGGAGGAAGAGGAAGAGGAACCCAAGAGGATGGCTTGTCTCCCAGCAGCCACACCGGCTTTGTGCTCAGCCAGTTCATTTGAGTTTGCATGTTTCTCTGCACTATGGATTTTGAGCATTTAGATTTCTTTAATCAAAAGCGTTTTAGTGACTCCAGTAGACATTTTCTTTCTGAGGCATCGTGCTTTGCATGAGAGCAGGCCAAGGTTGAGGGGAAAAGTAAAGTTAAAGTCGGTTCTCTTTCATAGCAACACGTATTGTCTGACATTCAGCCAGCTTTTTTTTTTTCTAATAATTTCTGTGCCTTTCTGTCCTGTATTTACTGTATTTAGAAAAAGCAGCTAGAATATTTCTCCATTAACTCTTGAGATTCACAGGACTGTCTAGCTCTGAGTCCTAGCAATAGACTCCTTAGAGGAGTAGTACGTTTATCTAGATTTTCTCTAGATAATGCAGGCGGAAGACCTGGGTTCCCGGGTGGGGCATTGCAGTTCTTCCTGTGTTTGGCTTCCAGGAATTACATGAACGACAGCCTTCGCACCGACGTCTTCGTGCGGTTCCAGCCAGAGAGCATCGCCTGTGCCTGCATTTATCTTGCTGCCCGGACGCTGGAGATCCCTTTGCCCAATCGTCCCCATTGGTTTCTTTTGTTTGGAGCAACTGAAGAAGAAATTCAGGAAATCTGCTTAAAGATCTTGCAGCTTTATGCTCGGAAAAAGGTTGATCTCACACACCTGGAGGGTGAAGTGGAAAAAAGAAAGCACGCTATCGAAGAGGCAAAGGCCCAAGCCCGGGGCCTGTTGCCTGGGGGCACACAGGTGCTGGATGGTACCTCGGGGTTCTCTCCTGCCCCCAAGCTGGTGGAATCCCCCAAAGAAGGTAAAGGGAGCAAGCCTTCCCCACTGTCTGTGAAGAACACCAAGAGGAGGCTGGAGGGCGCCAAGAAAGCCAAGGCGGACAGCCCCGTGAACGGCTTGCCAAAGGGGCGAGAGAGTCGGAGTCGGAGCCGGAGCCGTGAGCAGAGCTACTCGAGGTCCCCATCCCGATCAGCGTCTCCTAAGAGGAGGAAAAGTGACAGCGGCTCCACATCTGGTGGGTCCAAGTCGCAGAGCCGCTCCCGGAGCAGGAGTGACTCCCCACCGAGACAGGCCCCCCGCAGCGCTCCCTACAAAGGCTCTGAGATTCGGGGCTCCCGGAAGTCCAAGGACTGCAAGTACCCCCAGAAGCCACACAAGTCTCGGAGCCGGAGTTCTTCCCGTTCTCGAAGCAGGTCACGGGAGCGGGCGGATAATCCGGGAAAATACAAGAAGAAAAGTCATTACTACAGAGATCAGCGACGAGAGCGCTCGAGGTCGTATGAACGCACAGGCCGTCGCTATGAGCGGGACCACCCTGGGCACAGCAGGCATCGGAGGTGAGGCGGGGTTGCAGTGACTGGTGGCCGCAAGCCCTTCCCTGGGGAGTACCTGATGGCTGCCCTTTGACCCCCGGTGGCTGCCCTTTGACCCCCGGGTGTGCTCTCAGCGCAAGTGGTCCTAGAACAGGATTCTTTTTGGAAATGTCTGTCGACTGGACCTTGGTGGATTTGGAAATGGAACTGAGGGACCGGTGACACGTGCTTCAGACCGGTCTGGGGTGCGGCGCACACCTGGGCCCGTGCAGGGCTCAGCTCGGCAGCAGCTCTGAGGGCAGCTCAATGAAAAAGTGAATGCACACGCCCTTGTTGGCGTGGCCTGGCATGGCCTGGTGCTATCGGCAGCCGCTCTCCACTCCCCGACTGATACTCAATTACGTGAAGCCAAGAAAGATGATTTTTAGAACCTTTGCCTATATTAGGTTGTACTTATGTACATATTTTGCAGTGTTTCACAGGAGAAAGTGGCCTTAACTGCCCCTTATTCTCTCTCCACGTTGTAAATAAACATGTGTTTAATACAAGTT")

2815

In [8]:
# cds
len("ATGGCGGCGGCGGCGGCGGCGGCTGGTGCTGCAGGGTCGGCAGCTCCCGCGGCAGCGGCCGGCGCCCCGGGATCTGGGGGCGCACCCTCAGGGTCGCAGGGGGTGCTGATCGGGGACAGGCTGTACTCCGGGGTGCTCATCACCTTGGAGAACTGCCTCCTGCCTGACGACAAGCTCCGTTTCACGCCGTCCATGTCGAGCGGCCTCGACACCGACACAGAGACCGACCTCCGCGTGGTGGGCTGCGAGCTCATCCAGGCGGCCGGTATCCTGCTCCGCCTGCCGCAGGTGGCCATGGCTACCGGGCAGGTGTTGTTCCAGCGGTTCTTTTATACCAAGTCCTTCGTGAAGCACTCCATGGAGCATGTGTCAATGGCCTGTGTCCACCTGGCTTCCAAGATAGAAGAGGCCCCAAGACGCATACGGGACGTCATCAATGTGTTTCACCGCCTTCGACAGCTGAGAGACAAAAAGAAGCCCGTGCCTCTACTACTGGATCAAGATTATGTTAATTTAAAGAACCAAATTATAAAGGCGGAAAGACGAGTTCTCAAAGAGTTGGGTTTCTGCGTCCATGTGAAGCATCCTCATAAGATAATCGTTATGTACCTTCAGGTGTTAGAGTGTGAGCGTAACCAACACCTGGTCCAGACCTCATGGGTAGCCTCTGAGGACCCCTTGTTGAAATGGGACAGTTGGCAGCGGCTCTGA")

711

In [None]:
# cdna - cds
2815 - 711

2104

In [24]:
# 5' UTR - checks out
1334691-1334687+1

5

In [None]:
# 3' utr - must be 2099
(1326917-1326146+1) + (1325943-1325839+1) + (1325751-1325610+1) + (1323445-1323334+1) + (1323249-1323157+1) + (1322962-1322088+1)

2099

In [29]:
import pandas as pd

gtf_path = "/Users/joeyrich/Desktop/local/varseek/enst_line.gtf"
colnames = [
    "seqname", "source", "feature", "start", "end",
    "score", "strand", "frame", "attribute"
]

df = pd.read_csv(gtf_path, sep="\t", comment='#', names=colnames, header=None)
df.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,1,havana,transcript,1322088,1334691,.,-,.,"gene_id ""ENSG00000221978""; gene_version ""7""; t..."
1,1,havana,exon,1334399,1334691,.,-,.,"gene_id ""ENSG00000221978""; gene_version ""7""; t..."
2,1,havana,CDS,1334399,1334686,.,-,0,"gene_id ""ENSG00000221978""; gene_version ""7""; t..."
3,1,havana,start_codon,1334684,1334686,.,-,0,"gene_id ""ENSG00000221978""; gene_version ""7""; t..."
4,1,havana,exon,1333977,1334051,.,-,.,"gene_id ""ENSG00000221978""; gene_version ""7""; t..."


In [30]:
# Filter to only keep exons
exons = df[df['feature'] == 'exon'].copy()

# Calculate exon length
exons['exon_length'] = exons['end'] - exons['start'] + 1

# Sum the exon lengths
total_exon_length = exons['exon_length'].sum()

print(f"Total exon length: {total_exon_length}")


Total exon length: 2815


In [32]:
help(myfasta)

Help on Fastx object:

class Fastx(object)
 |  Methods defined here:
 |  
 |  __iter__(self, /)
 |      Implement iter(self).
 |  
 |  __next__(self, /)
 |      Implement next(self).
 |  
 |  __repr__(self, /)
 |      Return repr(self).
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.



In [113]:
# myfasta = pyfastx.Fastx(myfasta_file)
(myfasta).__next__()

('ENSVPAT00000000947.1',
 'GCAAGTCTCCTGGCTTGTGAAGGCCTAGCAGGTGTGAGTTTGGTTCCCACTGCAGCCAGCAAGAAGATGATGCTGAGCCAGATTGCCAGCAAGCAGGCCGAGAATGGAGAGCGGGCAGGTAGCCCTGATGTGCTGAGGTGCTCAAGTCAGGGCCACCGAAAGGACAGCGATAAGTCCCGGAGCCGCAAAGACGATGACAGCTTGGCTGAGGCCTCTCATTCAAAAAAGACTGTTAAAAAGGTGGTGGTAGTGGAACAAAATGGTTCCTTTCAAGTAAAGATTCCCAAAAATTTTGTTTGTGAACACTGCTTTGGAGCCTTTAGGAGCAGTTACCACCTCAAGAGGCACATCCTTATTCATACTGGTGAGAAGCCGTTTGAGTGTGATATATGTGATATGCGCTTCATCCAGAAGTATCACCTGGAGCGTCACAAGCGTGTGCACAGTGGTGAAAAGCCCTACCAGTGTGAACGGTGTCATCAG')

In [114]:
mylist = [1, 2, 3]

In [None]:
# it = iter(mylist)      # Get an iterator
print(next(it))

StopIteration: 

In [None]:
# myfasta_file = "/Users/joeyrich/Downloads/Vicugna_pacos.vicPac1.cdna.all.fa"
myfasta_file = "/Users/joeyrich/Downloads/Vicugna_pacos.vicPac1.dna.toplevel.fa"
import pyfastx
myfasta = pyfastx.Fastx(myfasta_file)

max_seq_length = 0
for name, seq in myfasta:
    if len(seq) > max_seq_length:
        max_seq_length = len(seq)
print(f"Maximum sequence length: {max_seq_length}")

Maximum sequence length: 94632


In [132]:
# myfasta_file = "/Users/joeyrich/Desktop/local/varseek/data/reference/ensembl_grch37_release93/Homo_sapiens.GRCh37.cdna.all.fa"
myfasta_file = "/Users/joeyrich/Desktop/local/varseek/data/reference/ensembl_grch37_release93/Homo_sapiens.GRCh37.dna.primary_assembly.fa"
import pyfastx
myfasta = pyfastx.Fastx(myfasta_file)

max_seq_length = 0
for name, seq in myfasta:
    if len(seq) > max_seq_length:
        max_seq_length = len(seq)
print(f"Maximum sequence length: {max_seq_length}")

Maximum sequence length: 249250621


5519660 vs 94632
249250621 vs 109224

In [187]:
import pandas as pd

# Simulate adata.var-like DataFrame
df = pd.DataFrame({
    'gene': ['GeneA', 'GeneB'],
    'vcrs_header': ['VAR1;VAR2', 'VAR3'],
    'other_info': [42, 99]
})

df

Unnamed: 0,gene,vcrs_header,other_info
0,GeneA,VAR1;VAR2,42
1,GeneB,VAR3,99


In [195]:
exploded = df.copy()
exploded['vcrs_header_individual'] = exploded['vcrs_header'].str.split(';')
exploded

Unnamed: 0,gene,vcrs_header,other_info,vcrs_header_individual
0,GeneA,VAR1;VAR2,42,"[VAR1, VAR2]"
1,GeneB,VAR3,99,[VAR3]


In [196]:
exploded = exploded.explode('vcrs_header_individual')
exploded

Unnamed: 0,gene,vcrs_header,other_info,vcrs_header_individual
0,GeneA,VAR1;VAR2,42,VAR1
0,GeneA,VAR1;VAR2,42,VAR2
1,GeneB,VAR3,99,VAR3


In [204]:
grouped = (
    exploded.groupby("vcrs_header", as_index=False)
    .agg({**{col: "first" for col in exploded.columns if col not in ["vcrs_header"]}})
    .reset_index(drop=True)
)
grouped['vcrs_header_list'] = grouped['vcrs_header'].str.split(';')
grouped.drop(columns=["vcrs_header_individual"], inplace=True)
grouped

Unnamed: 0,vcrs_header,gene,other_info,vcrs_header_list
0,VAR1;VAR2,GeneA,42,"[VAR1, VAR2]"
1,VAR3,GeneB,99,[VAR3]


In [None]:
import pandas as pd

bus_df_mutation = pd.DataFrame({
    'cell_id': ['A', 'B'],
    'vcrs_names': [['v1', 'v2'], ['v2', 'v3']],
    'umi': [10, 20]
})

Unnamed: 0,cell_id,vcrs_names,umi
0,A,"[v1, v2]",10
1,B,"[v2, v3]",20


In [208]:
# Step 1: Explode vcrs_names
exploded = bus_df_mutation.explode('vcrs_names', ignore_index=True)

# Step 2: Capture order of first appearance
exploded['__order__'] = exploded.groupby('vcrs_names', sort=False).ngroup()

# Step 3: Group by vcrs_names, collect other columns into lists
grouped = (exploded
           .groupby('vcrs_names', sort=False)
           .agg(lambda x: list(x))
           .reset_index())

# Step 4: Sort back to original order
grouped = grouped.sort_values('__order__').drop(columns='__order__')
grouped

Unnamed: 0,vcrs_names,cell_id,umi
0,v1,[A],[10]
1,v2,"[A, B]","[10, 20]"
2,v3,[B],[20]


In [210]:
exploded

Unnamed: 0,cell_id,vcrs_names,umi,__order__
0,A,v1,10,0
1,A,v2,10,1
2,B,v2,20,1
3,B,v3,20,2


In [221]:
merged.index

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [222]:
import pandas as pd
bus_df_mutation = pd.DataFrame({
    'barcode': ['AAA', 'AAC'],
    'vcrs_names': [['v1', 'v2', 'v4', 'v5', 'v6'], ['v3']],
})

adata_var = pd.DataFrame({
    'vcrs_id': ['v1', 'v2', 'v3', 'v4', 'v5', 'v6'],
    'gene_id': ['gene1', 'gene2', 'gene2', 'gene4', 'gene5', 'gene6'],
})

# Step 1: Explode the vcrs_names column in bus_df_mutation
exploded = bus_df_mutation.explode('vcrs_names', ignore_index=True)

# Step 2: Merge with adata_var on vcrs_names == vcrs_id
merged = exploded.merge(
    adata_var[['vcrs_id', 'gene_id']],
    left_on='vcrs_names',
    right_on='vcrs_id',
    how='left'
)

# Step 3: Group back by original rows using original row index
merged['original_index'] = merged.index.map(
    lambda i: 0 if i < len(bus_df_mutation.loc[0, 'vcrs_names']) else 1
)

# Step 4: Aggregate back to list
bus_df_mutation = (merged
          .groupby('original_index')
          .agg({
              'barcode': 'first',
              'vcrs_names': lambda x: list(x),
              'gene_id': lambda x: list(x)
          })
          .reset_index(drop=True)
)

bus_df_mutation

Unnamed: 0,barcode,vcrs_names,gene_id
0,AAA,"[v1, v2, v4, v5, v6]","[gene1, gene2, gene4, gene5, gene6]"
1,AAC,[v3],[gene2]


In [218]:
grouped

Unnamed: 0,vcrs_names,gene_id
0,"[v1, v2]","[gene1, gene2]"
1,"[v4, v5]","[gene4, gene5]"
2,"[v6, v3]","[gene6, gene2]"


In [217]:
merged

Unnamed: 0,barcode,vcrs_names,vcrs_id,gene_id
0,AAA,v1,v1,gene1
1,AAA,v2,v2,gene2
2,AAA,v4,v4,gene4
3,AAA,v5,v5,gene5
4,AAA,v6,v6,gene6
5,AAC,v3,v3,gene2
