# Extract cancer variant reads for alignment visualization

## In this notebook, we use a single RNA-seq fastq file from a melanoma cancer cell line from the the CCLE project. Learn more about the project here:
- sequencing data (ENA): https://www.ebi.ac.uk/ena/browser/view/PRJNA523380
- paper: https://www.nature.com/articles/s41586-019-1186-3

### Requirements: kb, samtools, and bowtie2

In [1]:
try:
    import varseek as vk
except ImportError:
    print("varseek not found, installing...")
    !pip install -U -q varseek
try:
    import RLSRWP_2025
except ImportError:
    print("RLSRWP_2025 not found, installing...")
    !pip install -q git+https://github.com/pachterlab/RLSRWP_2025.git

In [2]:
import anndata
import os
import numpy as np
import pandas as pd
import gget
import pyfastx
import glob
# pd.set_option('display.max_columns', None)
import varseek as vk
from varseek.utils import make_bus_df

RLSRWP_2025_dir = os.path.dirname(os.path.abspath(""))  # if this notebook resides in RLSRWP_2025/notebooks/0_data_download.ipynb, then this retrieves RLSRWP_2025
threads = "2"
use_head = False

### File path definitions and imports

In [3]:
vk_count_out_dir = os.path.join(RLSRWP_2025_dir, "data", "varseek_count_out_alignment_visualization_full")
kb_count_out_dir = os.path.join(vk_count_out_dir, "kb_count_out_vcrs")
adata_path = os.path.join(kb_count_out_dir, "counts_unfiltered", "adata.h5ad")
aligned_reads_parent_dir = os.path.join(vk_count_out_dir, "pseudoaligned_reads_to_vcrs_reference")
bowtie_read_alignments = os.path.join(vk_count_out_dir, "bowtie_read_alignments")

# vk ref out directory and files - downloaded if not already present
vk_ref_out_dir = os.path.join(RLSRWP_2025_dir, "data", "vk_ref_out")
vcrs_index = os.path.join(vk_ref_out_dir, "vcrs_index.idx")
vcrs_t2g = os.path.join(vk_ref_out_dir, "vcrs_t2g_filtered.txt")
vcrs_fasta = os.path.join(vk_ref_out_dir, "vcrs_filtered.fa")

# fastq directories - fastqs_dir downloaded if not already present, and fastqs_processed_dir created with fastp if not already present
fastqs_dir = os.path.join(RLSRWP_2025_dir, "data", "ccle_data_base", "RNASeq_MELHO_SKIN")
fastq_file = os.path.join(fastqs_dir, "SRR8615233_1.fastq.gz")
technology = "bulk"

# cosmic directory
cosmic_dir = os.path.join(RLSRWP_2025_dir, "data", "reference", "cosmic")
cosmic_csv = os.path.join(cosmic_dir, "CancerMutationCensus_AllData_Tsv_v101_GRCh37", "CancerMutationCensus_AllData_v101_GRCh37_mutation_workflow.csv")

# reference genome directory
reference_genome_dir = os.path.join(RLSRWP_2025_dir, "data", "reference", "ensembl_grch37_release93")

# kb count to reference genome directory and files - created if not already present - only used if qc_against_gene_matrix=True
qc_against_gene_matrix = False
kb_count_reference_genome_dir = os.path.join(RLSRWP_2025_dir, "data", "kb_count_reference_genome")
reference_genome_index = os.path.join(reference_genome_dir, "index.idx")  # either already exists or will be created
reference_genome_t2g = os.path.join(reference_genome_dir, "t2g.txt")  # either already exists or will be created
reference_genome_fasta = os.path.join(reference_genome_dir, "Homo_sapiens.GRCh37.dna.primary_assembly.fa")  # if reference_genome_index/reference_genome_t2g do not exist, then I need to supply the reference genome fasta and gtf
reference_genome_gtf = os.path.join(reference_genome_dir, "Homo_sapiens.GRCh37.87.gtf")  # if reference_genome_index/reference_genome_t2g do not exist, then I need to supply the reference genome fasta and gtf

# for bowtie2 - also uses reference_genome_fasta above (I could use cdna instead of genome too if desired)
bowtie_reference_dir = os.path.join(reference_genome_dir, "bowtie_index_genome")
bowtie_reference_prefix = os.path.join(bowtie_reference_dir, "index")

# general
w = "47"  # used during creation of the index, so cannot be altered
k = "51"  # used during creation of the index, so cannot be altered
strand = "unstranded"
parity = "single"  # although the original data is paired, we will only be using a single file, so we will run in single-end mode

# software
bustools = "/Users/joeyrich/miniconda3/envs/RLSRWP_2025/lib/python3.10/site-packages/kb_python/bins/darwin/m1/bustools/bustools"
bowtie2 = "bowtie2"
bowtie2_build = "bowtie2-build"
samtools = "samtools"

### Download VCRS reference files with varseek ref

In [4]:
if not os.path.exists(vcrs_index) or not os.path.exists(vcrs_t2g) or not os.path.exists(vcrs_fasta):
    vk.ref(variants="cosmic_cmc", sequences="cdna", w=w, k=k, dlist_reference_source="t2t", index_out=vcrs_index, t2g_out=vcrs_t2g, fasta_out=vcrs_fasta, download=True)

### Download COSMIC if not already downloaded

In [5]:
if cosmic_csv and not os.path.isfile(cosmic_csv):
    gget.cosmic(
        None,
        grch_version=37,
        cosmic_version=101,
        out=cosmic_dir,
        mutation_class="cancer",
        download_cosmic=True,
        keep_genome_info=True,
        remove_duplicates=True
    )

cosmic_df = pd.read_csv(cosmic_csv)

if "mutation_cdna" not in cosmic_df.columns:
    reference_cdna_path = os.path.join(reference_genome_dir, "Homo_sapiens.GRCh37.cdna.all.fa")
    reference_cds_path = os.path.join(reference_genome_dir, "Homo_sapiens.GRCh37.cds.all.fa")
    if not os.path.exists(reference_cdna_path):
        reference_cdna_dir = os.path.dirname(reference_cdna_path) if os.path.dirname(reference_cdna_path) else "."
        !gget ref -w cdna -r 93 --out_dir {reference_cdna_dir} -d human_grch37
        !gunzip {reference_cdna_path}.gz
    if not os.path.exists(reference_cds_path):
        reference_cds_dir = os.path.dirname(reference_cds_path) if os.path.dirname(reference_cds_path) else "."
        !gget ref -w cdna -r 93 --out_dir {reference_cds_dir} -d human_grch37
        !gunzip {reference_cds_path}.gz
    print("Converting CDS to cDNA in COSMIC")
    cosmic_df, _ = vk.utils.convert_mutation_cds_locations_to_cdna(input_csv_path=cosmic_df, output_csv_path=cosmic_csv, cds_fasta_path=reference_cds_path, cdna_fasta_path=reference_cdna_path, verbose=True)

if "header" not in cosmic_df.columns:
    cosmic_df["header"] = cosmic_df["seq_ID"] + ":" + cosmic_df["mutation_cdna"]
    cosmic_df.to_csv(cosmic_csv, index=False)

cosmic_df.head()

  cosmic_df = pd.read_csv(cosmic_csv)


Unnamed: 0,seq_ID,mutation,mutation_aa,GENOMIC_MUTATION_ID,chromosome,strand,mutation_genome,gene_name,mutation_id,mutation_cdna,header
0,ENST00000396153,c.1468C>T,p.Q490*,COSV105286190,1,-,g.51001067G>A,FAF1,48293736,c.1920C>T,ENST00000396153:c.1920C>T
1,ENST00000445907,c.162C>A,p.L54=,COSV57765075,7,+,g.136699774C>A,CHRM2,60222372,c.690C>A,ENST00000445907:c.690C>A
2,ENST00000445907,c.617A>T,p.Y206F,COSV108138517,7,+,g.136700229A>T,CHRM2,60229878,c.1145A>T,ENST00000445907:c.1145A>T
3,ENST00000445907,c.93A>T,p.G31=,COSV100282253,7,+,g.136699705A>T,CHRM2,60227007,c.621A>T,ENST00000445907:c.621A>T
4,ENST00000445907,c.956A>C,p.N319T,COSV108138508,7,+,g.136700568A>C,CHRM2,60213402,c.1484A>C,ENST00000445907:c.1484A>C


### Make bowtie2 index files

In [6]:
if not os.path.exists(bowtie_reference_dir) or len(os.listdir(bowtie_reference_dir)) == 0:
    os.makedirs(bowtie_reference_dir, exist_ok=True)
    !{bowtie2_build} --threads {threads} {reference_genome_fasta} {bowtie_reference_prefix}

### Download fastq file

In [7]:
if not os.path.isfile(fastq_file):
    fastq_file_link = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/003/SRR8615233/SRR8615233_1.fastq.gz"  # ["ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/003/SRR8615233/SRR8615233_1.fastq.gz", "ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/003/SRR8615233/SRR8615233_2.fastq.gz"]
    os.makedirs(fastqs_dir, exist_ok=True)
    !wget -c --tries=5 --retry-connrefused -O {fastq_file} {fastq_file_link}

--2025-03-21 09:14:25--  ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/003/SRR8615233/SRR8615233_1.fastq.gz
           => ‘/Users/joeyrich/Desktop/local/RLSRWP_2025/data/ccle_data_base/RNASeq_MELHO_SKIN/SRR8615233_1.fastq.gz’
Resolving ftp.sra.ebi.ac.uk (ftp.sra.ebi.ac.uk)... 193.62.193.165
Connecting to ftp.sra.ebi.ac.uk (ftp.sra.ebi.ac.uk)|193.62.193.165|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /vol1/fastq/SRR861/003/SRR8615233 ... done.
==> SIZE SRR8615233_1.fastq.gz ... 3728501472
==> PASV ... done.    ==> RETR SRR8615233_1.fastq.gz ... done.
Length: 3728501472 (3.5G) (unauthoritative)


2025-03-21 09:46:01 (820 KB/s) - Data connection: Operation timed out; Control connection closed.
Retrying.

--2025-03-21 10:01:02--  ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/003/SRR8615233/SRR8615233_1.fastq.gz
  (try: 2) => ‘/Users/joeyrich/Desktop/local/RLSRWP_2025/data/ccle_data_base/RNASeq_MELHO_SKIN/SRR8615

In [8]:
if use_head:
    fastq_file_head = fastq_file.replace(".fastq.gz", "_head.fastq")
    if not os.path.isfile(fastq_file_head):
        !zcat {fastq_file} | head -1000000 > $fastq_file_head
    fastq_file = fastq_file_head

In [9]:
!head {fastq_file}

MJ�mQ>�#K���X�U�n�������>�CQ�������<����e��
&.�`.�]���^��rq�3�s�
a��柀ߏ������9�����6�_	g������#>^�cz]>�_�c�g��
�/���C��Jw�/e�W�G��|�8�9����{Do=s�f��oZI3]?·@�i���gǫ<�PZ�"ɬ��C��|Ŧ �.��@A�b�}�W�:�ï��Je��fһ���'�{�v��Hx���N�Y>&�$%	ÄP����F�ⵇ�A8e(ŒerV(��G|>*���ܔ�?x����8$i a�l^��9|.QG�6CZ4����R���'�]�/\�'��!$]��4���o$-�[2��ȨD��$4�g��Ym*�����3��^)J�/��G:b���//k��DiX����~�/���_��&����0oP�f�zހ ����U֝]<��j�l5\��6����r5�ү�;n�t���]�e�00�EdMXc������[�?C���Z">��+�xo���z]���{��ڻO'[һ��)�OzV�6'��@��!G�S��Y��Vѫ�7^���%g��a��)��?���-�Kz9�.�w�����ԺT���@i҆a�Z4 ���ӹ�t�m�uu��y�FW�J�(�f��m�z-j��n%_��I�����/��7}3ZR�'�=4D$�����d0�~a�}o����oRp?��໣d<@�xӬN�FK!��=h��6hbV�5�&��Pql=sV��3���}@��j�d��(�_b>�p��84(�3<���v-\UmDݬ���6�J��3H�=JG�=���7g�؏&�:�����Oa�s"�����g����&Q���JM|�����ٳ�Oo{�؆,�a����T�Y�����g�H�*�'�"�No'zĈ_XW����#�y�͋'���	�s�Mtb�>��������J٣}tp֜����ZJ�?+)�w���K%;�)��l�e-�0��mY�w���

### Make pyfastx index files

In [10]:
fastq_indexed = pyfastx.Fastq(fastq_file, build_index=True)
vcrs_fasta_indexed = pyfastx.Fasta(vcrs_fasta, build_index=True)

### Make kb reference files if qc_against_gene_matrix=True

In [11]:
kb_count_reference_genome_adata = os.path.join(kb_count_reference_genome_dir, "counts_unfiltered", "adata.h5ad")
if qc_against_gene_matrix and not os.path.exists(kb_count_reference_genome_adata):  # check if kb count was run
    os.makedirs(kb_count_reference_genome_dir, exist_ok=True)
    if not os.path.exists(reference_genome_index) or not os.path.exists(reference_genome_t2g):  # check if kb ref was run
        if not os.path.exists(reference_genome_fasta) or not os.path.exists(reference_genome_gtf):
            reference_genome_out_dir = os.path.dirname(reference_genome_fasta) if reference_genome_fasta else "."
            # using grch37, ensembl 93 to agree with COSMIC
            !gget ref -w dna,gtf -r 93 --out_dir {reference_genome_out_dir} -d human_grch37 && gunzip {reference_genome_fasta}.gz && gunzip {reference_genome_gtf}.gz
        reference_genome_f1 = os.path.join(kb_count_reference_genome_dir, "f1.fa")
        !kb ref -t {threads} -i {reference_genome_index} -g {reference_genome_t2g} -f1 {reference_genome_f1} {reference_genome_fasta} {reference_genome_gtf}

### Perform variant screening with varseek count

In [12]:
if not os.path.exists(adata_path):
    vk_count_output_dict = vk.count(
        fastq_file,
        index=vcrs_index,
        t2g=vcrs_t2g,
        technology=technology,
        out=vk_count_out_dir,
        k=k,
        strand=strand,
        parity=parity,
        threads=threads,
        disable_fastqpp=True,
        disable_clean=True,
        disable_summarize=True
        # quality_control_fastqs=True, cut_front=True, cut_tail=True  # equivalent to the fastp step above
        # qc_against_gene_matrix=qc_against_gene_matrix, reference_genome_index=reference_genome_index, reference_genome_t2g=reference_genome_t2g,
    )

10:04:58 - INFO - Setting length_required to 51 if fastqpp is run
10:04:58 - INFO - Skipping vk fastqpp because disable_fastqpp=True
10:04:58 - INFO - Running kb count with command: kb count -t 2 -k 51 -i /Users/joeyrich/Desktop/local/RLSRWP_2025/data/vk_ref_out/vcrs_index.idx -g /Users/joeyrich/Desktop/local/RLSRWP_2025/data/vk_ref_out/vcrs_t2g_filtered.txt -x BULK --h5ad -o /Users/joeyrich/Desktop/local/RLSRWP_2025/data/varseek_count_out_alignment_visualization_full/kb_count_out_vcrs --overwrite --strand unstranded --parity single --num /Users/joeyrich/Desktop/local/RLSRWP_2025/data/ccle_data_base/RNASeq_MELHO_SKIN/SRR8615233_1.fastq.gz
[2025-03-21 10:05:05,199]    INFO [count] Using index /Users/joeyrich/Desktop/local/RLSRWP_2025/data/vk_ref_out/vcrs_index.idx to generate BUS file to /Users/joeyrich/Desktop/local/RLSRWP_2025/data/varseek_count_out_alignment_visualization_full/kb_count_out_vcrs from
[2025-03-21 10:05:05,200]    INFO [count]         /Users/joeyrich/Desktop/local/RLSRW

### Load in adata object

In [13]:
adata = anndata.read_h5ad(adata_path)
adata

AnnData object with n_obs × n_vars = 1 × 5329695

### View top 10 variants by total reads aligned

In [14]:
top10 = list(adata.var.index[np.argsort(np.array(adata.X.todense())[0])][::-1][:10])
# Create a dictionary mapping gene names to counts
top10_counts = np.array(adata.X.todense())[0][np.argsort(np.array(adata.X.todense())[0])][::-1][:10].astype(int)
top10_dict = dict(zip(top10, top10_counts))
top10_dict

{'ENST00000361381:c.492A>G': 129694,
 'ENST00000361381:c.952G>A': 38624,
 'ENST00000361681:c.351C>T': 19182,
 'ENST00000361681:c.441T>C': 14366,
 'ENST00000398606:c.562A>G': 13934,
 'ENST00000449260:c.1988C>T': 12589,
 'ENST00000251453:c.80T>G': 11981,
 'ENST00000398606:c.804T>C': 11300,
 'ENST00000343262:c.156C>T': 6296,
 'ENST00000314138:c.730C>T': 4712}

### Map the reads to the VCRS to which they aligned

In [27]:
kb_count_out_dir = '/Users/joeyrich/Desktop/local/RLSRWP_2025/data/varseek_count_out_alignment_visualization_full/kb_count_out_vcrs'
fastq_file = '/Users/joeyrich/Desktop/local/RLSRWP_2025/data/ccle_data_base/RNASeq_MELHO_SKIN/SRR8615233_1.fastq.gz'
technology = 'bulk'
parity = 'single'
bustools = '/Users/joeyrich/miniconda3/envs/RLSRWP_2025/lib/python3.10/site-packages/kb_python/bins/darwin/m1/bustools/bustools'

import importlib
import varseek.utils.varseek_clean_utils
importlib.reload(varseek.utils.varseek_clean_utils)
from varseek.utils.varseek_clean_utils import make_bus_df

bus_df = make_bus_df(kb_count_out_dir, fastq_file, technology=technology, parity=parity, bustools=bustools)

loading in transcripts
loading in barcodes


Processing FASTQ headers: 69015721it [00:55, 1238762.33it/s]


loading in ec matrix
loading in bus df
Merging fastq header df and ec_df into bus df
Determining what counts in count matrix


100%|██████████| 893751/893751 [00:00<00:00, 2372827.07it/s]


Saving bus df as parquet to /Users/joeyrich/Desktop/local/RLSRWP_2025/data/varseek_count_out_alignment_visualization_full/kb_count_out_vcrs/bus_df.parquet


In [31]:
bus_df.head(3)

Unnamed: 0,barcode,UMI,EC,read_index,fastq_header,transcript_names,file_index,gene_names,counted_in_count_matrix
0,AAAAAAAAAAAAAAAA,T,0,588,SRR8615233.589,"(ENST00000326592:c.1538A>C,)",0,"(ENST00000326592:c.1538A>C,)",True
1,AAAAAAAAAAAAAAAA,T,1,656,SRR8615233.657,"(ENST00000326592:c.1474A>C,)",0,"(ENST00000326592:c.1474A>C,)",True
2,AAAAAAAAAAAAAAAA,T,2,886,SRR8615233.887,"(ENST00000326592:c.1235C>T,)",0,"(ENST00000326592:c.1235C>T,)",True


In [None]:
bus_df.to_csv("bus_df_test.csv", index=False)
bus_df_copy = bus_df.copy()
bus_df_copy["transcript_names"] = bus_df_copy["transcript_names"].apply(list)
bus_df_copy["gene_names"] = bus_df_copy["gene_names"].apply(list)
bus_df_copy.to_parquet("bus_df_test.parquet", index=False)

In [None]:
bus_df_copy_from_csv = pd.read_csv("bus_df_test.csv")
print(type(bus_df_copy_from_csv['transcript_names'][0]))
bus_df_copy_from_csv.head()

<class 'str'>


Unnamed: 0,barcode,UMI,EC,read_index,fastq_header,transcript_names,file_index,gene_names,counted_in_count_matrix
0,AAAAAAAAAAAAAAAA,T,0,588,SRR8615233.589,"('ENST00000326592:c.1538A>C',)",0,"('ENST00000326592:c.1538A>C',)",True
1,AAAAAAAAAAAAAAAA,T,1,656,SRR8615233.657,"('ENST00000326592:c.1474A>C',)",0,"('ENST00000326592:c.1474A>C',)",True
2,AAAAAAAAAAAAAAAA,T,2,886,SRR8615233.887,"('ENST00000326592:c.1235C>T',)",0,"('ENST00000326592:c.1235C>T',)",True
3,AAAAAAAAAAAAAAAA,T,2,997,SRR8615233.998,"('ENST00000326592:c.1235C>T',)",0,"('ENST00000326592:c.1235C>T',)",True
4,AAAAAAAAAAAAAAAA,T,3,4353,SRR8615233.4354,"('ENST00000361390:c.518G>A',)",0,"('ENST00000361390:c.518G>A',)",True


In [None]:
def parquet_column_list_to_tuple(df):
    for col in df.columns:
        first_value = df[col].dropna().iloc[0] if not df[col].dropna().empty else None
        if isinstance(first_value, list):
            df[col] = df[col].apply(tuple)
def parquet_column_tuple_to_list(df):
    for col in df.columns:
        first_value = df[col].dropna().iloc[0] if not df[col].dropna().empty else None
        if isinstance(first_value, tuple):
            df[col] = df[col].apply(list)

In [79]:
bus_df_copy4.head().to_parquet("bus_df_test4_chunked.parquet", index=False, append=False)
bus_df_copy4.tail().to_parquet("bus_df_test4_chunked.parquet", index=False, append=True)
bus_df_copy4_chunked = pd.read_parquet("bus_df_test4_chunked.parquet")
bus_df_copy4_chunked

Unnamed: 0,barcode,UMI,EC,read_index,fastq_header,transcript_names,file_index,gene_names,counted_in_count_matrix
0,AAAAAAAAAAAAAAAA,T,0,588,SRR8615233.589,[ENST00000326592:c.1538A>C],0,[ENST00000326592:c.1538A>C],True
1,AAAAAAAAAAAAAAAA,T,1,656,SRR8615233.657,[ENST00000326592:c.1474A>C],0,[ENST00000326592:c.1474A>C],True
2,AAAAAAAAAAAAAAAA,T,2,886,SRR8615233.887,[ENST00000326592:c.1235C>T],0,[ENST00000326592:c.1235C>T],True
3,AAAAAAAAAAAAAAAA,T,2,997,SRR8615233.998,[ENST00000326592:c.1235C>T],0,[ENST00000326592:c.1235C>T],True
4,AAAAAAAAAAAAAAAA,T,3,4353,SRR8615233.4354,[ENST00000361390:c.518G>A],0,[ENST00000361390:c.518G>A],True
5,AAAAAAAAAAAAAAAA,T,155680,68998653,SRR8615233.68998654,[ENST00000379757:c.913C>T],0,[ENST00000379757:c.913C>T],True
6,AAAAAAAAAAAAAAAA,T,155681,68998654,SRR8615233.68998655,[ENST00000431877:c.2438T>A],0,[ENST00000431877:c.2438T>A],True
7,AAAAAAAAAAAAAAAA,T,155682,68998827,SRR8615233.68998828,[ENST00000262027:c.2692G>A],0,[ENST00000262027:c.2692G>A],True
8,AAAAAAAAAAAAAAAA,T,155683,68999127,SRR8615233.68999128,[ENST00000429344:c.963C>T],0,[ENST00000429344:c.963C>T],True
9,AAAAAAAAAAAAAAAA,T,155684,68999166,SRR8615233.68999167,[ENST00000485511:c.609G>T],0,[ENST00000485511:c.609G>T],True


In [74]:
save_type = "parquet"
bus_df_copy4 = bus_df.copy()

if save_type == "parquet":
    parquet_column_tuple_to_list(bus_df_copy4)
    bus_df_copy4.head()
    bus_df_copy4.to_parquet("bus_df_test.parquet", index=False)

    bus_df_copy4_from_parquet = pd.read_parquet("bus_df_test.parquet")
    parquet_column_list_to_tuple(bus_df_copy4_from_parquet)
    bus_df_copy4_from_parquet.head()

In [77]:
bus_df.head()

Unnamed: 0,barcode,UMI,EC,read_index,fastq_header,transcript_names,file_index,gene_names,counted_in_count_matrix
0,AAAAAAAAAAAAAAAA,T,0,588,SRR8615233.589,"(ENST00000326592:c.1538A>C,)",0,"(ENST00000326592:c.1538A>C,)",True
1,AAAAAAAAAAAAAAAA,T,1,656,SRR8615233.657,"(ENST00000326592:c.1474A>C,)",0,"(ENST00000326592:c.1474A>C,)",True
2,AAAAAAAAAAAAAAAA,T,2,886,SRR8615233.887,"(ENST00000326592:c.1235C>T,)",0,"(ENST00000326592:c.1235C>T,)",True
3,AAAAAAAAAAAAAAAA,T,2,997,SRR8615233.998,"(ENST00000326592:c.1235C>T,)",0,"(ENST00000326592:c.1235C>T,)",True
4,AAAAAAAAAAAAAAAA,T,3,4353,SRR8615233.4354,"(ENST00000361390:c.518G>A,)",0,"(ENST00000361390:c.518G>A,)",True


In [75]:
bus_df_copy4.head()

Unnamed: 0,barcode,UMI,EC,read_index,fastq_header,transcript_names,file_index,gene_names,counted_in_count_matrix
0,AAAAAAAAAAAAAAAA,T,0,588,SRR8615233.589,[ENST00000326592:c.1538A>C],0,[ENST00000326592:c.1538A>C],True
1,AAAAAAAAAAAAAAAA,T,1,656,SRR8615233.657,[ENST00000326592:c.1474A>C],0,[ENST00000326592:c.1474A>C],True
2,AAAAAAAAAAAAAAAA,T,2,886,SRR8615233.887,[ENST00000326592:c.1235C>T],0,[ENST00000326592:c.1235C>T],True
3,AAAAAAAAAAAAAAAA,T,2,997,SRR8615233.998,[ENST00000326592:c.1235C>T],0,[ENST00000326592:c.1235C>T],True
4,AAAAAAAAAAAAAAAA,T,3,4353,SRR8615233.4354,[ENST00000361390:c.518G>A],0,[ENST00000361390:c.518G>A],True


In [78]:
bus_df_copy4_from_parquet.head()

Unnamed: 0,barcode,UMI,EC,read_index,fastq_header,transcript_names,file_index,gene_names,counted_in_count_matrix
0,AAAAAAAAAAAAAAAA,T,0,588,SRR8615233.589,"(ENST00000326592:c.1538A>C,)",0,"(ENST00000326592:c.1538A>C,)",True
1,AAAAAAAAAAAAAAAA,T,1,656,SRR8615233.657,"(ENST00000326592:c.1474A>C,)",0,"(ENST00000326592:c.1474A>C,)",True
2,AAAAAAAAAAAAAAAA,T,2,886,SRR8615233.887,"(ENST00000326592:c.1235C>T,)",0,"(ENST00000326592:c.1235C>T,)",True
3,AAAAAAAAAAAAAAAA,T,2,997,SRR8615233.998,"(ENST00000326592:c.1235C>T,)",0,"(ENST00000326592:c.1235C>T,)",True
4,AAAAAAAAAAAAAAAA,T,3,4353,SRR8615233.4354,"(ENST00000361390:c.518G>A,)",0,"(ENST00000361390:c.518G>A,)",True


In [None]:
bus_df_copy_from_parquet = pd.read_parquet("bus_df_test.parquet")
print(type(bus_df_copy_from_parquet['transcript_names'][0]))
bus_df_copy_from_parquet.head()

<class 'list'>


Unnamed: 0,barcode,UMI,EC,read_index,fastq_header,transcript_names,file_index,gene_names,counted_in_count_matrix
0,AAAAAAAAAAAAAAAA,T,0,588,SRR8615233.589,[ENST00000326592:c.1538A>C],0,[ENST00000326592:c.1538A>C],True
1,AAAAAAAAAAAAAAAA,T,1,656,SRR8615233.657,[ENST00000326592:c.1474A>C],0,[ENST00000326592:c.1474A>C],True
2,AAAAAAAAAAAAAAAA,T,2,886,SRR8615233.887,[ENST00000326592:c.1235C>T],0,[ENST00000326592:c.1235C>T],True
3,AAAAAAAAAAAAAAAA,T,2,997,SRR8615233.998,[ENST00000326592:c.1235C>T],0,[ENST00000326592:c.1235C>T],True
4,AAAAAAAAAAAAAAAA,T,3,4353,SRR8615233.4354,[ENST00000361390:c.518G>A],0,[ENST00000361390:c.518G>A],True


In [58]:
import json
bus_df_copy_with_json = bus_df.copy()
bus_df_copy_with_json["transcript_names"] = bus_df["transcript_names"].apply(json.dumps)
bus_df_copy_with_json["gene_names"] = bus_df["gene_names"].apply(json.dumps)
bus_df_copy_with_json.to_parquet("bus_df_json.parquet", index=False)
!du -sh /Users/joeyrich/Desktop/local/RLSRWP_2025/notebooks/bus_df_json.parquet

282128.36s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


 18M	/Users/joeyrich/Desktop/local/RLSRWP_2025/notebooks/bus_df_json.parquet


In [64]:
bus_df["transcript_names"][0]

('ENST00000326592:c.1538A>C',)

In [None]:
bus_df_copy_with_json["transcript_names"]

str

In [25]:
bus_df = make_bus_df(kb_count_out_dir, fastq_file, technology=technology, parity=parity, bustools=bustools)
bus_df = bus_df.drop(columns=["transcript_names"]).rename(columns={"gene_names": "variant_names"})
filtered_bus_df = bus_df[bus_df["variant_names"].apply(lambda x: len(x) == 1)]  # remove multi-mapping reads
filtered_bus_df["variant_names_str"] = filtered_bus_df["variant_names"].apply(lambda x: x[0])  # cast to string
filtered_bus_df = filtered_bus_df[(filtered_bus_df["variant_names_str"].isin(top10)) & (filtered_bus_df["counted_in_count_matrix"])]
print(f"Number of reads in filtered bus file: {len(filtered_bus_df)}")
filtered_bus_df.head()

loading in transcripts
loading in barcodes


Processing FASTQ headers: 69015721it [00:59, 1164336.11it/s]


loading in ec matrix
loading in bus df
Merging fastq header df and ec_df into bus df
Determining what counts in count matrix


100%|██████████| 893751/893751 [00:00<00:00, 1797477.84it/s]


Saving bus df as parquet to /Users/joeyrich/Desktop/local/RLSRWP_2025/data/varseek_count_out_alignment_visualization_full/kb_count_out_vcrs/bus_df.parquet


ValueError: Can't infer object conversion type: 0         (ENST00000326592:c.1538A>C,)
1         (ENST00000326592:c.1474A>C,)
2         (ENST00000326592:c.1235C>T,)
3         (ENST00000326592:c.1235C>T,)
4          (ENST00000361390:c.518G>A,)
                      ...             
893746     (ENST00000379757:c.913C>T,)
893747    (ENST00000431877:c.2438T>A,)
893748    (ENST00000262027:c.2692G>A,)
893749     (ENST00000429344:c.963C>T,)
893750     (ENST00000485511:c.609G>T,)
Name: transcript_names, Length: 893751, dtype: object

In [None]:
# make IDs since the gene names in HGVSC format do not make for good folder names or bowtie2 header names
vcrs_header_to_id = {vcrs: f"vcrs_{i}" for i, vcrs in enumerate(list(top10))}
filtered_bus_df["vcrs_ids"] = filtered_bus_df["variant_names_str"].map(vcrs_header_to_id)

vcrs_header_to_id_file = os.path.join(vk_count_out_dir, "vcrs_header_to_id.txt")
with open(vcrs_header_to_id_file, "w") as f:
    for key, value in vcrs_header_to_id.items():
        f.write(f"{key}\t{value}\n")  # Tab-separated

id_to_vcrs_header = {v: k for k, v in vcrs_header_to_id.items()}
vcrs_header_to_id

{'ENST00000361624:c.795dup': 'vcrs_0',
 'ENST00000361390:c.910T>C': 'vcrs_1',
 'ENST00000361453:c.142dup': 'vcrs_2',
 'ENST00000361899:c.41T>C': 'vcrs_3',
 'ENST00000361899:c.268C>T': 'vcrs_4',
 'ENST00000361899:c.328G>A': 'vcrs_5',
 'ENST00000361899:c.326G>A': 'vcrs_6',
 'ENST00000361624:c.922G>A': 'vcrs_7',
 'ENST00000361624:c.226G>A': 'vcrs_8',
 'ENST00000361899:c.296C>T': 'vcrs_9'}

In [None]:
for vcrs_id in filtered_bus_df["vcrs_ids"].unique():  # Get unique gene names
    temp_df = filtered_bus_df[filtered_bus_df["vcrs_ids"] == vcrs_id]  # Filter
    fastq_headers = temp_df["fastq_header"].tolist()  # Get values as a list

    gene_dir = os.path.join(aligned_reads_parent_dir, vcrs_id)
    os.makedirs(gene_dir, exist_ok=True)
    
    aligned_reads_file = os.path.join(gene_dir, "1.fastq")
    with open(aligned_reads_file, "w") as f:
        for header in fastq_headers:
            sequence = fastq_indexed[header].seq
            qualities = fastq_indexed[header].qual
            f.write(f"@{header}\n{sequence}\n+\n{qualities}\n")

### View COSMIC metadata

In [None]:
cosmic_df_subset = cosmic_df[cosmic_df["header"].isin(top10)]
cosmic_df_subset["id"] = cosmic_df_subset["header"].map(vcrs_header_to_id)
cosmic_df_subset["counts"] = cosmic_df_subset["header"].map(lambda x: top10_dict[x])
cosmic_df_subset = cosmic_df_subset.sort_values(by="counts", ascending=False)
cosmic_df_subset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cosmic_df_subset["id"] = cosmic_df_subset["header"].map(vcrs_header_to_id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cosmic_df_subset["counts"] = cosmic_df_subset["header"].map(lambda x: top10_dict[x])


Unnamed: 0,seq_ID,mutation,mutation_aa,GENOMIC_MUTATION_ID,chromosome,strand,mutation_genome,gene_name,mutation_id,mutation_cdna,header,id,counts
5349419,ENST00000361624,c.795dup,p.E266Rfs*?,COSV104419744,MT,,g.6698_6699dup,MT-CO1,36495429,c.795dup,ENST00000361624:c.795dup,vcrs_0,627
1078993,ENST00000361390,c.910T>C,p.Y304H,COSV62293824,MT,+,g.4216T>C,MT-ND1,35322641,c.910T>C,ENST00000361390:c.910T>C,vcrs_1,536
5268715,ENST00000361453,c.142dup,p.M48Nfs*?,COSV107450120,MT,,g.4611_4612dup,MT-ND2,35576559,c.142dup,ENST00000361453:c.142dup,vcrs_2,187
835480,ENST00000361899,c.41T>C,p.I14T,COSV106107919,MT,+,g.8567T>C,MT-ATP6,38183749,c.41T>C,ENST00000361899:c.41T>C,vcrs_3,69
835485,ENST00000361899,c.268C>T,p.H90Y,COSV62293579,MT,+,g.8794C>T,MT-ATP6,38184577,c.268C>T,ENST00000361899:c.268C>T,vcrs_4,15
835511,ENST00000361899,c.328G>A,p.A110T,COSV106107895,MT,+,g.8854G>A,MT-ATP6,38184385,c.328G>A,ENST00000361899:c.328G>A,vcrs_5,12
835482,ENST00000361899,c.326G>A,p.W109*,COSV106107946,MT,+,g.8852G>A,MT-ATP6,38184768,c.326G>A,ENST00000361899:c.326G>A,vcrs_6,11
3261302,ENST00000361624,c.922G>A,p.A308T,COSV106107908,MT,+,g.6825G>A,MT-CO1,36495787,c.922G>A,ENST00000361624:c.922G>A,vcrs_7,10
3261422,ENST00000361624,c.226G>A,p.G76*,COSV106107894,MT,+,g.6129G>A,MT-CO1,36495476,c.226G>A,ENST00000361624:c.226G>A,vcrs_8,10
835504,ENST00000361899,c.296C>T,p.S99F,COSV62294449,MT,+,g.8822C>T,MT-ATP6,38184051,c.296C>T,ENST00000361899:c.296C>T,vcrs_9,9


### Align (1) pulled out reads and (2) the VCRSs to the human genome to generate bam files:

In [None]:
for folder in sorted(glob.glob(aligned_reads_parent_dir + "/*/")):
    variant = folder.split("/")[-2]
    variant_header = id_to_vcrs_header[variant]
    print(f"{variant} ({variant_header})\n")

    outfolder = f"{bowtie_read_alignments}/{variant}"
    os.makedirs(outfolder, exist_ok=True)
    
    # Align reads to human ref using bowtie2
    variant_sam = variant + "_reads.sam"
    !$bowtie2 \
        --very-sensitive \
        -k 3 \
        -x $bowtie_reference_prefix \
        -p $threads \
        -q $aligned_reads_parent_dir/$variant/1.fastq \
        -S $outfolder/$variant_sam

    # Convert sam to bam
    variant_bam = variant + "_reads.bam"
    !$samtools view \
        -bS -F4 $outfolder/$variant_sam \
        > $outfolder/$variant_bam
    
    # Sort bam file
    variant_bam_sorted_prefix = "final_" + variant + "_sorted_reads"
    variant_bam_sorted = variant_bam_sorted_prefix + ".bam"
    !$samtools sort \
        $outfolder/$variant_bam \
        $outfolder/$variant_bam_sorted_prefix
    
    # Create an index for the sorted bam file (creates a .bai file)
    !$samtools index $outfolder/$variant_bam_sorted


    # repeat but use vcrs as the input instead of reads
    variant_reference_sequence = vcrs_fasta_indexed[variant_header].seq.strip()

    # Align vcrs to human ref using bowtie2
    variant_sam = variant + "_vcrs.sam"
    !$bowtie2 \
        --very-sensitive \
        -k 3 \
        -x $bowtie_reference_prefix \
        -p $threads \
        -c $variant_reference_sequence \
        -S $outfolder/$variant_sam
    # use k 3 to get multiple alignments, not just the best one

    # Convert sam to bam
    variant_bam = variant + "_vcrs.bam"
    !$samtools view \
        -bS -F4 $outfolder/$variant_sam \
        > $outfolder/$variant_bam

    # Sort bam file
    variant_bam_sorted_prefix = "final_" + variant + "_sorted_vcrs"
    variant_bam_sorted = variant_bam_sorted_prefix + ".bam"
    !$samtools sort \
        $outfolder/$variant_bam \
        $outfolder/$variant_bam_sorted_prefix

    # Create an index for the sorted bam file (creates a .bai file)
    !$samtools index $outfolder/$variant_bam_sorted

vcrs_0 (ENST00000361624:c.795dup)

627 reads; of these:
  627 (100.00%) were unpaired; of these:
    3 (0.48%) aligned 0 times
    0 (0.00%) aligned exactly 1 time
    624 (99.52%) aligned >1 times
99.52% overall alignment rate
[samopen] SAM header is present: 84 sequences.
1 reads; of these:
  1 (100.00%) were unpaired; of these:
    0 (0.00%) aligned 0 times
    0 (0.00%) aligned exactly 1 time
    1 (100.00%) aligned >1 times
100.00% overall alignment rate
[samopen] SAM header is present: 84 sequences.
vcrs_1 (ENST00000361390:c.910T>C)

536 reads; of these:
  536 (100.00%) were unpaired; of these:
    4 (0.75%) aligned 0 times
    0 (0.00%) aligned exactly 1 time
    532 (99.25%) aligned >1 times
99.25% overall alignment rate
[samopen] SAM header is present: 84 sequences.
1 reads; of these:
  1 (100.00%) were unpaired; of these:
    0 (0.00%) aligned 0 times
    0 (0.00%) aligned exactly 1 time
    1 (100.00%) aligned >1 times
100.00% overall alignment rate
[samopen] SAM header is p

### Load the files in bowtie_read_alignments into NCBI Genome workbench (or another genome viewer) to visualize the alignments (the ones with "final_")